diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4ff3ec5..2fc32d5 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -8,10 +8,30 @@ on: jobs: ci: strategy: + fail-fast: false matrix: include: + # default static build, both arches - runs_on: ubicloud-standard-30 + variant: default + target: "" + build_args: | + ENABLE_FDKAAC=1 - runs_on: ubicloud-standard-30-arm + variant: default + target: "" + build_args: | + ENABLE_FDKAAC=1 + # CUDA variant (NVENC/NVDEC/CUVID), amd64 only for now. + # No GPU on the runner — the build only verifies that the binary + # links and that nvenc/cuvid/cuda show up in -encoders/-hwaccels. + - runs_on: ubicloud-standard-30 + variant: cuda + target: final-cuda + build_args: | + ENABLE_FDKAAC=1 + ENABLE_CUDA=1 + name: ci (${{ matrix.variant }} / ${{ matrix.runs_on }}) runs-on: ${{ matrix.runs_on }} steps: - uses: actions/checkout@v3 @@ -21,7 +41,7 @@ jobs: with: context: . push: false - cache-from: type=gha - cache-to: type=gha,mode=max - build-args: | - ENABLE_FDKAAC=1 + cache-from: type=gha,scope=${{ matrix.variant }}-${{ matrix.runs_on }} + cache-to: type=gha,mode=max,scope=${{ matrix.variant }}-${{ matrix.runs_on }} + target: ${{ matrix.target }} + build-args: ${{ matrix.build_args }} diff --git a/.github/workflows/multiarch.yml b/.github/workflows/multiarch.yml index 2037ccc..26f15a7 100644 --- a/.github/workflows/multiarch.yml +++ b/.github/workflows/multiarch.yml @@ -12,28 +12,82 @@ env: REGISTRY_IMAGE: mwader/static-ffmpeg jobs: - build: - name: Build image - strategy: - matrix: - include: - - runs_on: ubicloud-standard-8-arm - tag: arm64 - - runs_on: ubuntu-latest - tag: amd64 + # arm64 default — independent, runs in parallel with amd64. 
+ build-default-arm64: + name: Build image (default / arm64) + runs-on: ubicloud-standard-8-arm + steps: + - uses: actions/checkout@v4 + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - name: Docker build + uses: docker/build-push-action@v6 + with: + context: . + tags: image:default-arm64 + load: true + cache-from: type=gha,scope=builder-arm64 + cache-to: type=gha,scope=builder-arm64,mode=max + - name: Docker save + run: docker image save --output image-default-arm64.tar image:default-arm64 + - uses: actions/upload-artifact@v4 + with: + name: image-default-arm64 + path: image-default-arm64.tar + retention-days: 1 + + # amd64 default — populates the shared builder-amd64 cache scope. + build-default-amd64: + name: Build image (default / amd64) + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - name: Docker build + uses: docker/build-push-action@v6 + with: + context: . + tags: image:default-amd64 + load: true + cache-from: type=gha,scope=builder-amd64 + cache-to: type=gha,scope=builder-amd64,mode=max + - name: Docker save + run: docker image save --output image-default-amd64.tar image:default-amd64 + - uses: actions/upload-artifact@v4 + with: + name: image-default-amd64 + path: image-default-amd64.tar + retention-days: 1 - runs-on: ${{ matrix.runs_on }} + # CUDA variant (NVENC/NVDEC/CUVID), amd64 only. + # Runs *after* default-amd64 so it reuses the populated builder-amd64 + # cache scope: every builder layer up to the final stage is a cache hit, + # and only the final-cuda stage has to materialize. + build-cuda-amd64: + name: Build image (cuda / amd64) + runs-on: ubuntu-latest + needs: build-default-amd64 steps: - uses: actions/checkout@v4 + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 - name: Docker build - run: docker build --tag image:${{ matrix.tag }} . + uses: docker/build-push-action@v6 + with: + context: . 
+ target: final-cuda + build-args: ENABLE_CUDA=1 + tags: image:cuda-amd64 + load: true + cache-from: type=gha,scope=builder-amd64 + cache-to: type=gha,scope=builder-amd64,mode=max - name: Docker save - run: docker image save --output image-${{ matrix.tag }}.tar image:${{ matrix.tag }} - - name: Upload Docker image-${{ matrix.tag }} - uses: actions/upload-artifact@v4 + run: docker image save --output image-cuda-amd64.tar image:cuda-amd64 + - uses: actions/upload-artifact@v4 with: - name: image-${{ matrix.tag }} - path: image-${{ matrix.tag }}.tar + name: image-cuda-amd64 + path: image-cuda-amd64.tar retention-days: 1 tag: @@ -53,22 +107,23 @@ jobs: ' >> "$GITHUB_OUTPUT" merge: - name: Merge and push images + name: Merge and push default images runs-on: ubuntu-latest needs: - - build + - build-default-arm64 + - build-default-amd64 - tag steps: - name: Download digests uses: actions/download-artifact@v4 with: path: /tmp - pattern: image-* + pattern: image-default-* merge-multiple: true - name: Load Docker images run: | - docker image load --input /tmp/image-arm64.tar - docker image load --input /tmp/image-amd64.tar + docker image load --input /tmp/image-default-arm64.tar + docker image load --input /tmp/image-default-amd64.tar - name: Docker meta id: meta uses: docker/metadata-action@v5 @@ -81,8 +136,8 @@ jobs: password: ${{ secrets.DOCKERHUB_TOKEN }} - name: Create manifest list and push run: | - docker tag image:arm64 ${{ env.REGISTRY_IMAGE }}:${{ needs.tag.outputs.TAG }}-arm64 - docker tag image:amd64 ${{ env.REGISTRY_IMAGE }}:${{ needs.tag.outputs.TAG }}-amd64 + docker tag image:default-arm64 ${{ env.REGISTRY_IMAGE }}:${{ needs.tag.outputs.TAG }}-arm64 + docker tag image:default-amd64 ${{ env.REGISTRY_IMAGE }}:${{ needs.tag.outputs.TAG }}-amd64 docker push ${{ env.REGISTRY_IMAGE }}:${{ needs.tag.outputs.TAG }}-arm64 docker push ${{ env.REGISTRY_IMAGE }}:${{ needs.tag.outputs.TAG }}-amd64 docker manifest create \ @@ -91,3 +146,32 @@ jobs: --amend ${{ 
env.REGISTRY_IMAGE }}:${{ needs.tag.outputs.TAG }}-amd64 docker manifest inspect ${{ env.REGISTRY_IMAGE }}:${{ needs.tag.outputs.TAG }} docker manifest push ${{ env.REGISTRY_IMAGE }}:${{ needs.tag.outputs.TAG }} + + merge-cuda: + name: Push CUDA image (amd64 only) + runs-on: ubuntu-latest + needs: + - build-cuda-amd64 + - tag + steps: + - name: Download digests + uses: actions/download-artifact@v4 + with: + path: /tmp + pattern: image-cuda-* + merge-multiple: true + - name: Load Docker image + run: docker image load --input /tmp/image-cuda-amd64.tar + - name: Login to Docker Hub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + - name: Tag and push CUDA image + run: | + # CUDA variant is amd64-only for now; published as a single-arch tag. + docker tag image:cuda-amd64 ${{ env.REGISTRY_IMAGE }}:${{ needs.tag.outputs.TAG }}-cuda-amd64 + docker tag image:cuda-amd64 ${{ env.REGISTRY_IMAGE }}:${{ needs.tag.outputs.TAG }}-cuda + docker push ${{ env.REGISTRY_IMAGE }}:${{ needs.tag.outputs.TAG }}-cuda-amd64 + docker push ${{ env.REGISTRY_IMAGE }}:${{ needs.tag.outputs.TAG }}-cuda + diff --git a/Dockerfile b/Dockerfile index 1551539..4602cf6 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1037,6 +1037,25 @@ RUN \ --enable-static && \ make -j$(nproc) install +# NVIDIA codec headers (header-only; no CUDA toolkit needed). ffmpeg dlopen()s the +# real driver libs (libcuda / libnvcuvid / libnvidia-encode) at runtime, injected +# by the NVIDIA Container Toolkit. Only built when ENABLE_CUDA is set. +# See docs/ffmpeg-with-cuda.md. 
+# bump: ffnvcodec /FFNVCODEC_VERSION=([\d.]+)/ https://github.com/FFmpeg/nv-codec-headers.git|^13 +# bump: ffnvcodec after ./hashupdate Dockerfile FFNVCODEC $LATEST +# bump: ffnvcodec link "Releases" https://github.com/FFmpeg/nv-codec-headers/releases +ARG FFNVCODEC_VERSION=13.0.19.0 +ARG FFNVCODEC_URL="https://github.com/FFmpeg/nv-codec-headers/archive/refs/tags/n${FFNVCODEC_VERSION}.tar.gz" +ARG FFNVCODEC_SHA256=86d15d1a7c0ac73a0eafdfc57bebfeba7da8264595bf531cf4d8db1c22940116 +ARG ENABLE_CUDA= +RUN \ + if [ -n "$ENABLE_CUDA" ]; then \ + wget $WGET_OPTS -O ffnvcodec.tar.gz "$FFNVCODEC_URL" && \ + echo "$FFNVCODEC_SHA256 ffnvcodec.tar.gz" | sha256sum -c - && \ + tar $TAR_OPTS ffnvcodec.tar.gz && cd nv-codec-headers-* && \ + make PREFIX=/usr/local install ; \ + fi + # requires libdrm # bump: libva /LIBVA_VERSION=([\d.]+)/ https://github.com/intel/libva.git|^2 # bump: libva after ./hashupdate Dockerfile LIBVA $LATEST @@ -1112,91 +1131,113 @@ ARG FFMPEG_VERSION=8.1 ARG FFMPEG_URL="https://ffmpeg.org/releases/ffmpeg-$FFMPEG_VERSION.tar.bz2" ARG FFMPEG_SHA256=c07039598df7d64d3c8b42c4e25b1959fc908621c6f6c2946881133f3b27eda2 ARG ENABLE_FDKAAC= -# sed changes --toolchain=hardened -pie to -static-pie +# sed changes --toolchain=hardened -pie to -static-pie (default build only). # -# ldflags stack-size=2097152 is to increase default stack size from 128KB (musl default) to something -# more similar to glibc (2MB). This fixing segfault with libaom-av1 and libsvtav1 as they seems to pass -# large things on the stack. +# CUDA variant: keep -pie (musl dynamic-PIE) so ffnvcodec can dlopen() the +# NVIDIA driver libs. All other deps stay statically archived; only the musl +# loader/libc is dynamic. See docs/ffmpeg-with-cuda.md. # -# ldfalgs -Wl,--allow-multiple-definition is a workaround for linking with multiple rust staticlib to -# not cause collision in toolchain symbols, see comment in checkdupsym script for details. 
+# ldflags stack-size=2097152 raises musl's 128KB default to ~glibc 2MB +# (libaom/libsvtav1 pass large objects on the stack). +# ldflags --allow-multiple-definition works around rust staticlib toolchain +# symbol collisions (see checkdupsym). RUN \ wget $WGET_OPTS -O ffmpeg.tar.bz2 "$FFMPEG_URL" && \ echo "$FFMPEG_SHA256 ffmpeg.tar.bz2" | sha256sum -c - && \ tar $TAR_OPTS ffmpeg.tar.bz2 && cd ffmpeg* && \ FDKAAC_FLAGS=$(if [[ -n "$ENABLE_FDKAAC" ]] ;then echo " --enable-libfdk-aac --enable-nonfree " ;else echo ""; fi) && \ - sed -i 's/add_ldexeflags -fPIE -pie/add_ldexeflags -fPIE -static-pie/' configure && \ + CUDA_FLAGS=$(if [[ -n "$ENABLE_CUDA" ]] ;then echo " --enable-ffnvcodec --enable-cuvid --enable-nvenc --enable-nvdec " ;else echo ""; fi) && \ + if [[ -z "$ENABLE_CUDA" ]]; then \ + # Default: fully static-pie musl binary, no loader, no dlopen. + sed -i 's/add_ldexeflags -fPIE -pie/add_ldexeflags -fPIE -static-pie/' configure ; \ + EXTRA_LDFLAGS="-fopenmp -Wl,--allow-multiple-definition -Wl,-z,stack-size=2097152" ; \ + EXTRA_LIBS="" ; \ + else \ + # CUDA: musl dynamic-PIE. Link the dynamic libc by ABSOLUTE PATH (not -lc) + # to avoid musl's libc.a 25-byte dlopen() stub that always returns NULL — + # gcc's hardened toolchain can otherwise resolve dlopen/dlsym/dlerror from + # the static archive even when -Bdynamic is requested, breaking nvenc with + # a silent "Cannot load libcuda.so.1" (no openat syscall fires). + # See docs/ffmpeg-with-cuda.md (P1). 
+ EXTRA_LDFLAGS="-fopenmp -Wl,--allow-multiple-definition -Wl,-z,stack-size=2097152 \ + -Wl,--no-as-needed,/lib/ld-musl-x86_64.so.1,--as-needed \ + -Wl,--as-needed -Wl,-Bstatic \ + -static-libstdc++ -static-libgcc" ; \ + EXTRA_LIBS="-lgomp -Wl,-Bdynamic -lc" ; \ + fi && \ ./configure \ - --pkg-config-flags="--static" \ - --extra-cflags="-fopenmp" \ - --extra-ldflags="-fopenmp -Wl,--allow-multiple-definition -Wl,-z,stack-size=2097152" \ - --toolchain=hardened \ - --disable-debug \ - --disable-shared \ - --disable-ffplay \ - --enable-static \ - --enable-gpl \ - --enable-version3 \ - $FDKAAC_FLAGS \ - --enable-fontconfig \ - --enable-gray \ - --enable-iconv \ - --enable-lcms2 \ - --enable-libaom \ - --enable-libaribb24 \ - --enable-libass \ - --enable-libbluray \ - --enable-libdav1d \ - --enable-libdavs2 \ - --enable-libfreetype \ - --enable-libfribidi \ - --enable-libgme \ - --enable-libgsm \ - --enable-libharfbuzz \ - --enable-libjxl \ - --enable-libkvazaar \ - --enable-libmodplug \ - --enable-libmp3lame \ - --enable-libmysofa \ - --enable-libopencore-amrnb \ - --enable-libopencore-amrwb \ - --enable-libopenjpeg \ - --enable-libopus \ - --enable-librabbitmq \ - --enable-librav1e \ - --enable-librsvg \ - --enable-librtmp \ - --enable-librubberband \ - --enable-libshine \ - --enable-libsnappy \ - --enable-libsoxr \ - --enable-libspeex \ - --enable-libsrt \ - --enable-libssh \ - --enable-libsvtav1 \ - --enable-libtheora \ - --enable-libtwolame \ - --enable-libuavs3d \ - --enable-libvidstab \ - --enable-libvmaf \ - --enable-libvo-amrwbenc \ - --enable-libvorbis \ - --enable-libvpl \ - --enable-libvpx \ - --enable-libvvenc \ - --enable-libwebp \ - --enable-libx264 \ - --enable-libx265 \ - --enable-libxavs2 \ - --enable-libxevd \ - --enable-libxeve \ - --enable-libxml2 \ - --enable-libxvid \ - --enable-libzimg \ - --enable-libzmq \ - --enable-openssl \ - || (cat ffbuild/config.log ; false) \ - && make -j$(nproc) install + --pkg-config-flags="--static" \ + 
--extra-cflags="-fopenmp" \ + --extra-ldflags="$EXTRA_LDFLAGS" \ + --extra-libs="$EXTRA_LIBS" \ + --toolchain=hardened \ + --disable-debug \ + --disable-shared \ + --disable-ffplay \ + --enable-static \ + --enable-gpl \ + --enable-version3 \ + $FDKAAC_FLAGS \ + $CUDA_FLAGS \ + --enable-fontconfig \ + --enable-gray \ + --enable-iconv \ + --enable-lcms2 \ + --enable-libaom \ + --enable-libaribb24 \ + --enable-libass \ + --enable-libbluray \ + --enable-libdav1d \ + --enable-libdavs2 \ + --enable-libfreetype \ + --enable-libfribidi \ + --enable-libgme \ + --enable-libgsm \ + --enable-libharfbuzz \ + --enable-libjxl \ + --enable-libkvazaar \ + --enable-libmodplug \ + --enable-libmp3lame \ + --enable-libmysofa \ + --enable-libopencore-amrnb \ + --enable-libopencore-amrwb \ + --enable-libopenjpeg \ + --enable-libopus \ + --enable-librabbitmq \ + --enable-librav1e \ + --enable-librsvg \ + --enable-librtmp \ + --enable-librubberband \ + --enable-libshine \ + --enable-libsnappy \ + --enable-libsoxr \ + --enable-libspeex \ + --enable-libsrt \ + --enable-libssh \ + --enable-libsvtav1 \ + --enable-libtheora \ + --enable-libtwolame \ + --enable-libuavs3d \ + --enable-libvidstab \ + --enable-libvmaf \ + --enable-libvo-amrwbenc \ + --enable-libvorbis \ + --enable-libvpl \ + --enable-libvpx \ + --enable-libvvenc \ + --enable-libwebp \ + --enable-libx264 \ + --enable-libx265 \ + --enable-libxavs2 \ + --enable-libxevd \ + --enable-libxeve \ + --enable-libxml2 \ + --enable-libxvid \ + --enable-libzimg \ + --enable-libzmq \ + --enable-openssl \ + || (cat ffbuild/config.log ; false) && \ + make -j$(nproc) install RUN \ EXPAT_VERSION=$(pkg-config --modversion expat) \ @@ -1273,13 +1314,17 @@ RUN \ libzimg: env.ZIMG_VERSION, \ libzmq: env.LIBZMQ_VERSION, \ openssl: env.OPENSSL_VERSION, \ + ffnvcodec: env.FFNVCODEC_VERSION, \ }' > /versions.json -# make sure binaries has no dependencies, is relro, pie and stack nx +# make sure binaries has no dependencies, is relro, pie and stack nx. 
+# CUDA build is musl dynamic-PIE; --cuda allows the musl loader/libc as the +# only NEEDED entry. COPY checkelf / RUN \ - /checkelf /usr/local/bin/ffmpeg && \ - /checkelf /usr/local/bin/ffprobe + CHECKELF_FLAGS=$(if [ -n "$ENABLE_CUDA" ]; then echo "--cuda"; fi) && \ + /checkelf $CHECKELF_FLAGS /usr/local/bin/ffmpeg && \ + /checkelf $CHECKELF_FLAGS /usr/local/bin/ffprobe # workaround for using -Wl,--allow-multiple-definition # see comment in checkdupsym for details COPY checkdupsym / @@ -1320,6 +1365,150 @@ RUN ["/ffmpeg", "-f", "lavfi", "-i", "testsrc", "-c:v", "libx265", "-t", "100ms" FROM scratch AS final2 COPY --from=final1 / / -FROM final2 +FROM final2 AS final +LABEL maintainer="Mattias Wadman mattias.wadman@gmail.com" +ENTRYPOINT ["/ffmpeg"] + +# CUDA / NVENC / NVDEC variant. See docs/ffmpeg-with-cuda.md for full design. +# +# Build: docker build --build-arg ENABLE_CUDA=1 --target final-cuda -t mwader/static-ffmpeg:-cuda . +# Run: docker run --gpus all --rm mwader/static-ffmpeg:-cuda \ +# -hwaccel cuda -hwaccel_output_format cuda -i in.mp4 -c:v h264_nvenc out.mp4 +# +# Requires NVIDIA driver on host + nvidia-container-toolkit. The binary is a musl +# dynamic-PIE so the loader is present and the NVIDIA driver libs (libcuda.so.1, +# libnvcuvid.so, libnvidia-encode.so) injected by the toolkit can be dlopen()'d. +# No CUDA toolkit needed at build or run time. +# +# --enable-libnpp / --enable-cuda-nvcc are NOT included (require glibc CUDA toolkit). +# Use scale_cuda instead of scale_npp. +FROM alpine:3.20.3 AS final-cuda1 +# Real ffmpeg ELF lives at /ffmpeg.bin; /ffmpeg is the bash wrapper (added below) +# that execs it. This way `COPY --from=...:cuda /ffmpeg /ffmpeg.bin /` from +# a downstream image gives a drop-in /ffmpeg that already includes the +# teardown-SIGSEGV workaround — no custom ENTRYPOINT needed. 
+COPY --from=builder /usr/local/bin/ffmpeg /ffmpeg.bin +COPY --from=builder /usr/local/bin/ffprobe / +COPY --from=builder /versions.json / +COPY --from=builder /usr/local/share/doc/ffmpeg/* /doc/ +COPY --from=builder /etc/ssl/cert.pem /etc/ssl/cert.pem +COPY --from=builder /etc/fonts/ /etc/fonts/ +COPY --from=builder /usr/share/fonts/ /usr/share/fonts/ +COPY --from=builder /usr/share/consolefonts/ /usr/share/consolefonts/ +COPY --from=builder /var/cache/fontconfig/ /var/cache/fontconfig/ + +# gcompat: glibc->musl shim. NVIDIA driver libs are built against glibc and have +# DT_NEEDED entries for libc.so.6 / libpthread.so.0 / libdl.so.2 / libm.so.6 / +# librt.so.1 — gcompat provides those SONAMEs as musl wrappers. libstdc++ is +# pulled in for NVIDIA helper libs (e.g. libnvidia-ngx). gcompat omits libdl.so.2 +# (musl folds dlopen into libc) so symlink it manually. +RUN apk add --no-cache gcompat libstdc++ && \ + ln -sf libgcompat.so.0 /lib/libdl.so.2 + +# nvshim: tiny LD_PRELOAD library exporting glibc-internal symbols that gcompat +# does NOT provide but the real NVIDIA driver backend calls during cuInit(). +# Without these, the stub libcuda dlopens fine but its backend fails with +# "Error relocating: : symbol not found", which ffmpeg surfaces as the +# misleading "Cannot load libcuda.so.1". +# +# IMPORTANT: this shim must NOT interpose exit / _exit / _Exit. Doing so +# silently swallows ffmpeg's real exit codes (every error returns 0). +# Process-lifecycle policy belongs in the bash entrypoint wrapper below. +# See docs/ffmpeg-with-cuda.md (P6). 
+RUN apk add --no-cache --virtual .nvshim-build gcc musl-dev && \
+    mkdir -p /usr/local/lib && \
+    printf '%s\n' \
+      '#define _GNU_SOURCE' \
+      '#include <dlfcn.h>' \
+      '#include <pthread.h>' \
+      '#include <signal.h>' \
+      '#include <stddef.h>' \
+      '#include <stdlib.h>' \
+      '#include <unistd.h>' \
+      'const char *gnu_get_libc_version(void) { return "2.35"; }' \
+      'const char *gnu_get_libc_release(void) { return "stable"; }' \
+      '/* __libc_current_sigrtmin/max are deliberately NOT shimmed: musl libc.so exports both, */' \
+      '/* and SIGRTMIN/SIGRTMAX expand to calls to them, so a shim here would call itself. */' \
+      'int __register_atfork(void (*p)(void), void (*pa)(void), void (*c)(void), void *dso) {' \
+      '    (void)dso; return pthread_atfork(p, pa, c);' \
+      '}' \
+      'char __libc_single_threaded = 0; /* glibc ABI type is char; 0 = assume multi-threaded */' \
+      'int __cxa_thread_atexit_impl(void (*f)(void*), void *o, void *dso) {' \
+      '    (void)f; (void)o; (void)dso; return 0;' \
+      '}' \
+      'char *secure_getenv(const char *name) { return getenv(name); }' \
+      '/* dlmopen: glibc-only namespaced dlopen; musl has no link namespaces. */' \
+      'typedef long Lmid_t;' \
+      'void *dlmopen(Lmid_t lmid, const char *file, int mode) {' \
+      '    (void)lmid; return dlopen(file, mode);' \
+      '}' \
+      '/* glibc-internal dl* variants used by nss / driver init. */' \
+      'void *__libc_dlopen_mode(const char *name, int mode) { return dlopen(name, mode); }' \
+      'void *__libc_dlsym(void *handle, const char *name) { return dlsym(handle, name); }' \
+      'int __libc_dlclose(void *handle) { return dlclose(handle); }' \
+      '/* dlvsym: glibc versioned dlsym; musl has no symbol versioning. */' \
+      'void *dlvsym(void *handle, const char *name, const char *version) {' \
+      '    (void)version; return dlsym(handle, name);' \
+      '}' \
+      > /tmp/nvshim.c && \
+    gcc -shared -fPIC -nostartfiles -o /usr/local/lib/libnvshim.so /tmp/nvshim.c -lpthread -ldl && \
+    rm /tmp/nvshim.c && \
+    apk del .nvshim-build
+
+# musl loader fallback search path. The NVIDIA Container Toolkit injects driver
+# libs into one of these depending on host distro; musl's defaults
+# (/lib:/usr/local/lib:/usr/lib) miss all three.
+RUN printf '/lib\n/usr/local/lib\n/usr/lib\n/usr/lib64\n/usr/lib/x86_64-linux-gnu\n/usr/lib/wsl/lib\n' \ + > /etc/ld-musl-x86_64.path + + +# Entrypoint wrapper installed AS /ffmpeg itself: convert the benign teardown +# SIGSEGV (139 -> 0) that libcuda's __cxa_finalize triggers under musl + gcompat. +# The crash happens inside main() after the encode is complete and all output is +# flushed, so no in-process hook can suppress it. Heuristic: only downgrade 139 +# when stderr contains no recognisable error keyword. Real failure exit codes +# (1, 8, 254, ...) propagate unchanged. See docs/ffmpeg-with-cuda.md (P5). +RUN apk add --no-cache bash && \ + printf '%s\n' \ + '#!/bin/bash' \ + 'errfile=$(mktemp)' \ + 'shellerr=$(mktemp)' \ + 'trap "rm -f \"$errfile\" \"$shellerr\"" EXIT' \ + 'exec 3>&1' \ + 'exec 4>&2' \ + 'exec 2>"$shellerr"' \ + '{ /ffmpeg.bin "$@" 2>&1 1>&3 3>&-; } | tee "$errfile" >&4' \ + 'rc=${PIPESTATUS[0]}' \ + 'exec 3>&-' \ + 'exec 2>&4 4>&-' \ + 'grep -vE "Segmentation fault.*core dumped.*/ffmpeg" "$shellerr" >&2 || true' \ + 'if [ "$rc" = "139" ] && ! grep -qiE "(^|[^a-z])(error|cannot load|conversion failed|not found|invalid|failed|no such)" "$errfile"; then' \ + ' exit 0' \ + 'fi' \ + 'exit "$rc"' \ + > /ffmpeg && \ + chmod +x /ffmpeg + +# sanity tests (cannot exercise actual GPU encode without a GPU at build time). +# /ffmpeg goes through the wrapper -> /ffmpeg.bin; both must work. +RUN ["/ffmpeg", "-version"] +RUN ["/ffmpeg.bin", "-version"] +RUN ["/ffprobe", "-version"] +RUN ["/ffmpeg", "-hide_banner", "-buildconf"] +RUN /ffmpeg -hide_banner -hwaccels 2>&1 | grep -q cuda +RUN /ffmpeg -hide_banner -encoders 2>&1 | grep -q nvenc +RUN /ffmpeg -hide_banner -decoders 2>&1 | grep -q cuvid + +# clamp all files into one layer +FROM scratch AS final-cuda2 +COPY --from=final-cuda1 / / + +FROM final-cuda2 AS final-cuda LABEL maintainer="Mattias Wadman mattias.wadman@gmail.com" +# Default toolkit env so callers only need `--gpus all`. 
+# compute -> libcuda.so.1 ; video -> libnvcuvid + libnvidia-encode (NVENC/NVDEC) ; +# utility -> libnvidia-ml + nvidia-smi. +ENV NVIDIA_VISIBLE_DEVICES=all \ + NVIDIA_DRIVER_CAPABILITIES=compute,video,utility \ + LD_PRELOAD=/usr/local/lib/libnvshim.so ENTRYPOINT ["/ffmpeg"] diff --git a/README.md b/README.md index 59715de..c7ef754 100644 --- a/README.md +++ b/README.md @@ -92,6 +92,7 @@ alias ffprobe='docker run -i --rm -u $UID:$GROUPS -v "$PWD:$PWD" -w "$PWD" --ent - [libzimg](https://github.com/sekrit-twc/zimg) - [libzmq](https://github.com/zeromq/libzmq) - [openssl](https://openssl.org) +- NVIDIA NVENC / NVDEC / CUVID via [nv-codec-headers](https://github.com/FFmpeg/nv-codec-headers) (only in the CUDA variant, [see below](#cuda--nvenc--nvdec-nvidia-gpu-acceleration)) - and all native ffmpeg codecs, formats, filters etc. ### Files in the image @@ -114,6 +115,10 @@ alias ffprobe='docker run -i --rm -u $UID:$GROUPS -v "$PWD:$PWD" -w "$PWD" --ent `MAJOR.MINOR.PATCH[-BUILD]` Specific version of FFmpeg with the features that was in master at the time of tagging. `-BUILD` means that was an additional build with that version to add of fix something. +`-cuda` (and `latest-cuda`) — same FFmpeg version compiled with NVIDIA +NVENC / NVDEC / CUVID support, see [CUDA / NVENC / NVDEC](#cuda--nvenc--nvdec-nvidia-gpu-acceleration) +below. Currently amd64 only (published as `-cuda` → `-cuda-amd64`). + ### Security Binaries are built with various hardening features but it's *still a good idea to run them @@ -126,6 +131,160 @@ Due to license issues the docker image does not include libfdk-aac by default. A docker build --build-arg ENABLE_FDKAAC=1 . -t my-ffmpeg-static:latest ``` +### CUDA / NVENC / NVDEC (NVIDIA GPU acceleration) + +The default image is fully static and does **not** support NVIDIA GPU acceleration +(a fully static-pie musl binary has no dynamic loader, so it cannot `dlopen()` the +NVIDIA driver libraries at runtime). 
+ +A separate **CUDA variant** can be built that includes `ffnvcodec`, `nvenc`, +`nvdec` and `cuvid` support. In this variant the binary is a *musl dynamic-PIE* +(all FFmpeg dependencies remain statically archived; only the musl loader / libc +stays dynamic) so that FFmpeg can `dlopen()` the NVIDIA driver libs +(`libcuda.so.1`, `libnvcuvid.so`, `libnvidia-encode.so`) which the +[NVIDIA Container Toolkit](https://github.com/NVIDIA/nvidia-container-toolkit) +injects into the container at runtime via `--gpus all`. + +No CUDA toolkit is needed to build or to run — only header-only +[`nv-codec-headers`](https://github.com/FFmpeg/nv-codec-headers) at build time +and the host's NVIDIA driver at run time. + +#### Build + +```sh +docker build --build-arg ENABLE_CUDA=1 --target final-cuda \ + -t my-ffmpeg-static:cuda . +``` + +#### Run + +Requires the NVIDIA driver on the host and `nvidia-container-toolkit` installed +and configured in Docker. + +```sh +docker run --gpus all -i --rm -v "$PWD:$PWD" -w "$PWD" my-ffmpeg-static:cuda \ + -hwaccel cuda -hwaccel_output_format cuda -i input.mp4 \ + -c:a copy -c:v h264_nvenc -b:v 5M output.mp4 +``` + +Verify GPU support inside the container: + +```sh +docker run --gpus all --rm --entrypoint=/ffmpeg my-ffmpeg-static:cuda -hide_banner -hwaccels +docker run --gpus all --rm --entrypoint=/ffmpeg my-ffmpeg-static:cuda -hide_banner -encoders | grep nvenc +``` + +Supported encoders: `h264_nvenc`, `hevc_nvenc`, `av1_nvenc` (GPU dependent). +Supported decoders / hwaccel: `cuda`, `cuvid` (`h264_cuvid`, `hevc_cuvid`, …). + +#### Use in another image with `COPY --from` + +Unlike the default static binary, the CUDA variant has runtime dependencies +beyond the binary itself. To get a working NVENC/NVDEC build in your own +image you need to copy **all** of the following from `:8.1-cuda`: + +```Dockerfile +FROM alpine:3.20 + +# 1. The binaries. 
/ffmpeg in the cuda image is a bash wrapper that execs
+#    /ffmpeg.bin (the real ELF) — it downgrades the benign teardown SIGSEGV
+#    (exit 139 → 0) while preserving real ffmpeg exit codes. Both files must
+#    be copied; the wrapper expects to find /ffmpeg.bin at the same root.
+COPY --from=mwader/static-ffmpeg:8.1-cuda /ffmpeg /ffmpeg
+COPY --from=mwader/static-ffmpeg:8.1-cuda /ffmpeg.bin /ffmpeg.bin
+COPY --from=mwader/static-ffmpeg:8.1-cuda /ffprobe /usr/local/bin/
+
+# 2. musl loader path file — adds /usr/lib64, /usr/lib/wsl/lib, etc. so musl
+#    can find the toolkit-injected NVIDIA driver libs.
+COPY --from=mwader/static-ffmpeg:8.1-cuda /etc/ld-musl-x86_64.path /etc/ld-musl-x86_64.path
+
+# 3. The glibc → musl ABI shim (LD_PRELOAD'd into ffmpeg).
+COPY --from=mwader/static-ffmpeg:8.1-cuda /usr/local/lib/libnvshim.so /usr/local/lib/
+
+# 4. gcompat + bash + the libdl.so.2 → libgcompat.so.0 symlink the NVIDIA
+#    driver libs need at DT_NEEDED resolution time. bash is required by the
+#    /ffmpeg wrapper script.
+RUN apk add --no-cache gcompat libstdc++ bash && \
+    ln -sf libgcompat.so.0 /lib/libdl.so.2
+
+# 5. Toolkit env (compute → libcuda.so.1, video → libnvcuvid/libnvidia-encode).
+ENV NVIDIA_VISIBLE_DEVICES=all \
+    NVIDIA_DRIVER_CAPABILITIES=compute,utility,video \
+    LD_PRELOAD=/usr/local/lib/libnvshim.so
+
+ENTRYPOINT ["/ffmpeg"]
+```
+
+Notes:
+
+- The base image **must be Alpine** (or otherwise musl-based with a compatible
+  musl major version). Glibc-based images — including `debian:*-slim`
+  (e.g. `bookworm-slim`), `ubuntu:*`, `python:*-slim`, `nvidia/cuda:*`,
+  `redhat/ubi*`, etc. — are **not** supported destinations: the binary's
+  `PT_INTERP` is `/lib/ld-musl-x86_64.so.1`, which doesn't exist on those
+  distros, and the `gcompat` shim in step 4 is Alpine-only. If you need a
+  Debian/Ubuntu runtime, run the published `mwader/static-ffmpeg:<version>-cuda`
+  image directly (it's already Alpine-based) instead of `COPY --from`'ing
+  into a glibc base.
+- Skipping any of items 2–5 will produce a binary that builds and runs + `-version` fine but fails at the first NVENC/NVDEC call. +- Run with `--gpus all` (and the NVIDIA Container Toolkit installed on the + host) for GPU access — same as running `mwader/static-ffmpeg:8.1-cuda` + directly. + +##### Multi-process images (Python / Node / app + ffmpeg) + +The example above sets `LD_PRELOAD=/usr/local/lib/libnvshim.so` as image-wide +`ENV`. That's safe in an **ffmpeg-only** image (the published `:*-cuda` image +runs only `/ffmpeg`, which was built and tested with the shim preloaded), but +it is **not** safe in an image that also runs other musl binaries — `pip`, +`python`, `node`, your app, etc. `libnvshim.so` exports glibc-only symbols and +transitively pulls in `gcompat` (via `DT_NEEDED libdl.so.2`). Forcing that +into every process tends to crash CPython and other musl interpreters with +`SIGSEGV` (exit code 139) at startup. + +For multi-process images, scope the preload to ffmpeg only with a small +wrapper instead of `ENV LD_PRELOAD`: + +```Dockerfile +# Replace step 5's `LD_PRELOAD=...` ENV line with a wrapper that sets +# LD_PRELOAD only for the ffmpeg process. Other processes (pip, python, +# sh, ...) run with a clean environment. The wrapper at /usr/local/bin/ffmpeg +# also exposes ffmpeg on PATH for your app to call as `ffmpeg`. +RUN printf '%s\n' \ + '#!/bin/sh' \ + 'exec env LD_PRELOAD=/usr/local/lib/libnvshim.so /ffmpeg "$@"' \ + > /usr/local/bin/ffmpeg \ + && chmod +x /usr/local/bin/ffmpeg + +ENV NVIDIA_VISIBLE_DEVICES=all \ + NVIDIA_DRIVER_CAPABILITIES=compute,utility,video +# (no ENV LD_PRELOAD here) +``` + +`/usr/local/bin/ffmpeg` (the wrapper) execs `/ffmpeg` (the static-ffmpeg bash +wrapper that downgrades the benign teardown SIGSEGV) which execs +`/ffmpeg.bin` (the real ELF). Exit codes propagate unchanged via `exec`. Your +app continues to call `ffmpeg` from `PATH` as normal. 
+ +If you also invoke `ffprobe` against CUDA-accelerated decoders and see it +crash, wrap it the same way (rename the copied binary to `ffprobe.bin` first +and put the wrapper at `/usr/local/bin/ffprobe`). For most ffprobe use cases +this isn't needed. + +#### Limitations + +- `--enable-cuda-nvcc` and `--enable-libnpp` are **not** included — they require + the full glibc-based CUDA toolkit and would defeat the static/musl design. + Use `scale_cuda` instead of `scale_npp` for GPU resizing. +- The CUDA variant is **not fully static**. The binary depends on the musl + loader/libc that ship in the `alpine` base of the `final-cuda` stage. If you + copy the binary into another image, that image must provide a compatible + musl libc (i.e. an Alpine-based image of the matching `musl` major version). +- Without `--gpus all` (or without the NVIDIA Container Toolkit) the binary + still runs but `nvenc`/`nvdec`/`cuda` initialization will fail at runtime. +- amd64 only. + ### Fonts usage with SVG or draw text filters etc The image ships with some basic fonts (`font-terminus font-inconsolata font-dejavu font-awesome`) that can be used when running the image directly. If your copying the binaries into some image you have to install fonts somehow. How to do this depends a bit on distributions but in general look for font packages and how to make [fontconfig](https://www.freedesktop.org/wiki/Software/fontconfig/) know about them. @@ -288,6 +447,5 @@ usage and potential distribution of such. - Add libopenapv - Add libplacebo, chromaprint, etc. ... -- Add acceleration support (GPU, CUDA, ...) - Add *.a *.so libraries, headers and pkg-config somehow diff --git a/checkelf b/checkelf index b4233b4..d500d42 100755 --- a/checkelf +++ b/checkelf @@ -1,14 +1,35 @@ #!/bin/sh set -eu -NOEXTLIBS=$(test "$(ldd "$1" | wc -l)" -eq 1 && echo yes || echo no) +# Usage: checkelf [--cuda] +# +# In default mode the binary must have NO external library deps (fully static-pie). 
+# In --cuda mode the binary is a musl dynamic-PIE: only the musl loader and libc +# (which are the same .so) are allowed as NEEDED entries, so that ffmpeg can +# dlopen() the NVIDIA driver libs (libcuda.so.1, libnvcuvid.so, libnvidia-encode.so) +# at runtime when the container is started with `--gpus all`. + +MODE=default +if [ "${1:-}" = "--cuda" ]; then + MODE=cuda + shift +fi + +if [ "$MODE" = "cuda" ]; then + # Allow only the musl loader / libc lines from `ldd`. Anything else is unexpected. + EXTRA=$(ldd "$1" 2>/dev/null | grep -E -v 'ld-musl|libc\.musl|statically linked' || true) + NOEXTLIBS=$(test -z "$EXTRA" && echo yes || echo no) +else + NOEXTLIBS=$(test "$(ldd "$1" | wc -l)" -eq 1 && echo yes || echo no) +fi RELRO=$(readelf -l "$1" | grep -q GNU_RELRO && echo yes || echo no) BIND_NOW=$(readelf -d "$1" | grep -q BIND_NOW && echo yes || echo no) PIE=$(readelf -h "$1" | grep -q DYN && echo yes || echo no) STACKNX=$(readelf -W -l "$1" | grep GNU_STACK | grep -q -v RWE && echo yes || echo no) file "$1" -echo "No external libs: $NOEXTLIBS" +echo "Mode: $MODE" +echo "No unexpected external libs: $NOEXTLIBS" echo "Relocate read-only: $RELRO" echo "Resolve at startup: $BIND_NOW" echo "Position independent code: $PIE" diff --git a/docs/ffmpeg-with-cuda.md b/docs/ffmpeg-with-cuda.md new file mode 100644 index 0000000..e1286dc --- /dev/null +++ b/docs/ffmpeg-with-cuda.md @@ -0,0 +1,549 @@ +# Adding NVIDIA CUDA / NVENC / NVDEC support to `static-ffmpeg` + +**Date:** 2026-04-24 → 2026-05-03 +**Tracking issue:** [#480 — Support for CUDA](https://github.com/wader/static-ffmpeg/issues/480) +**Outcome:** Separate `:-cuda` image variant; default `:` remains a fully static-pie binary. 
+ +--- + +## TL;DR + +| | Default `:8.1` | CUDA `:8.1-cuda` | +|---|---|---| +| Linkage | static-pie musl | musl **dynamic-PIE** (libc only) | +| `readelf -d` NEEDED | (none) | exactly one: `libc.musl-x86_64.so.1` | +| GPU | ❌ | ✅ NVENC / NVDEC / CUVID | +| Arch | amd64 + arm64 | amd64 only | +| Base image | scratch | alpine | +| ffmpeg exit codes | upstream | identical to upstream | + +The CUDA variant works on Alpine + musl by combining six independently-essential +layers (link-time + runtime). Each layer fixes one specific failure mode that +appeared during development. The layers are summarized below; full +problem → cause → fix sections follow. + +| # | Layer | Stage | Fixes | +|---|---|---|---| +| 1 | Absolute-path link of `/lib/ld-musl-x86_64.so.1` | builder | dlopen returning NULL silently (P1) | +| 2 | Dynamic-PIE link mode (`-fPIE -pie`, not `-static-pie`) | builder | dlopen impossible on static-pie (P1) | +| 3 | `/etc/ld-musl-x86_64.path` listing toolkit injection dirs | runtime | musl can't find `/usr/lib64`, `/usr/lib/wsl/lib` (P3) | +| 4 | `gcompat` package + `libdl.so.2 → libgcompat.so.0` symlink | runtime | NVIDIA driver libs need `libc.so.6` / `libdl.so.2` (P4) | +| 5 | `libnvshim.so` LD_PRELOAD (ABI-shim symbols only) | runtime | glibc-internal symbols missing from gcompat (P4) | +| 6 | Bash entrypoint wrapper (139 → 0 only) | runtime | benign teardown SIGSEGV from libcuda dtors (P5) | + +--- + +## 1. Architecture decision + +### Two separate variants, not one + +- The default `mwader/static-ffmpeg` is a fully static-pie musl binary that drops into `FROM scratch`. We must not silently break that for existing users. +- CUDA requires `dlopen()` of host driver libraries → fundamentally incompatible with `static-pie` on musl (no dynamic loader). +- CUDA users need the NVIDIA Container Toolkit and a GPU host — different deployment. +- → Different tag = explicit user opt-in + clear support boundary. 
+ +### Build-arg `ENABLE_CUDA` + +A single `ARG ENABLE_CUDA=` controls everything: + +- Adds `nv-codec-headers` (header-only, no CUDA toolkit at build time). +- Adds `--enable-ffnvcodec --enable-cuvid --enable-nvenc --enable-nvdec`. +- Switches link mode from `static-pie` to musl dynamic-PIE. +- Sets `NVIDIA_VISIBLE_DEVICES=all` and `NVIDIA_DRIVER_CAPABILITIES=compute,utility,video`. +- Writes `/etc/ld-musl-x86_64.path` so musl's loader can find toolkit-injected libs. +- Switches `checkelf` to `--cuda` mode (allows libc as the only NEEDED entry). + +CI builds two images per release: default (no arg) and `final-cuda` target with `ENABLE_CUDA=1`. + +### Explicitly NOT supported + +| Feature | Reason | +|---|---| +| `--enable-cuda-nvcc` | Requires the full ~3 GB glibc-based CUDA toolkit at build time | +| `--enable-libnpp` / `scale_npp` | Same — glibc-only; use `scale_cuda` instead | +| `arm64` | NVIDIA Container Toolkit on arm64 is server-class only (Jetson uses a different stack) | +| `FROM scratch` / distroless target images | No musl loader available | + +--- + +## 2. Problem → Root cause → Fix + +Each subsection records one failure mode encountered during development. + +--- + +### P1. `[h264_nvenc] Cannot load libcuda.so.1` — `dlopen()` silently returns NULL + +**Symptom.** Binary builds, `checkelf --cuda` passes, but at runtime +`dlopen("libcuda.so.1")` returns NULL. `strace -e openat` shows ffmpeg never +even attempts to open any libcuda file — no syscall fires at all. + +**Root cause.** Two independent musl traps stacked together: + +1. **`-static-pie` has no dynamic loader.** A static-pie musl binary cannot + `dlopen()` anything by definition. +2. **musl's static `libc.a` ships a 25-byte `dlopen` stub** that always returns + `NULL` with `errno=ENOSYS`. Even after switching to dynamic-PIE, gcc's + `--toolchain=hardened` spec file kept emitting late references that pulled + `libc.a` back in, restoring the stub inside the binary. 
The bug was + invisible to standard checks: `BIND_NOW`, `RELRO`, `PIE`, NX stack all + passed; `ldd` still showed only one extra NEEDED entry. Only + `readelf -s --dyn-syms /ffmpeg | grep dlopen` revealed: + ``` + 21987: 000000000338c50e 25 FUNC WEAK DEFAULT 14 dlopen + ``` + — `dlopen` defined inside `.text` at 25 bytes, not `UND`. + + Variants tried that did NOT fix it: + - `--extra-libs=' -lgomp -Wl,-Bdynamic -lc '` reorder — gcc spec file re-pulled `libc.a`. + - Hiding `/usr/lib/libc.a` during link — broke libgme configure-time symbol checks. + +**Fix (Layers 1 + 2).** + +1. Link mode: replace `add_ldexeflags -fPIE -static-pie` with `-fPIE -pie`. +2. Link the musl combined loader/libc by **absolute path** in + `--extra-ldflags`, so the linker resolution is immune to `-Bstatic` / + `-Bdynamic` toggles and gcc spec-file re-emissions: + ```sh + --extra-ldflags='-fopenmp -Wl,--allow-multiple-definition \ + -Wl,-z,stack-size=2097152 \ + -Wl,--no-as-needed,/lib/ld-musl-x86_64.so.1,--as-needed \ + -Wl,--as-needed -Wl,-Bstatic \ + -static-libstdc++ -static-libgcc' + --extra-libs='-lgomp -Wl,-Bdynamic -lc' + ``` + + On Alpine, `/lib/ld-musl-x86_64.so.1` is *both* the dynamic loader and libc; + one absolute filename covers everything we needed `-lc` for. An absolute + filename is opened literally regardless of `-Bstatic` mode and cannot be + re-resolved against `libc.a`. + +**Verification.** +```sh +readelf -s --dyn-syms /ffmpeg | grep -E 'dlopen|dlsym|dlerror|dlclose' +# Each must be 0-size UND, OR not exported (resolved internally against +# the absolute-path libc — both work). The functional NVENC encode is +# the ground truth; readelf is the cheap pre-flight. +``` + +**Lesson.** Never link musl `libc.a` into a binary that calls `dlopen` — it +will silently use the stub. The `-Bdynamic -lc -Bstatic` reorder is fragile +under `--toolchain=hardened`; prefer the absolute-path form. + +--- + +### P2. 
`checkelf` rejects the dynamic-PIE binary + +**Symptom.** The CUDA build's hardening check rejects the binary because it +has a `NEEDED` entry (libc), whereas the default build has zero. + +**Fix.** Add `--cuda` flag to `checkelf`. In `--cuda` mode it allows the +musl loader/libc entry from `ldd` output (everything else still rejected). +All other hardening checks (RELRO, BIND_NOW, PIE, NX stack) preserved. + +--- + +### P3. `dlopen("libcuda.so.1")` reports "Library not found" + +**Symptom.** With driver libs actually mounted by the toolkit, +`dlopen("libcuda.so.1")` still fails with "Library not found". + +**Root cause.** musl's default loader search path is +`/lib:/usr/local/lib:/usr/lib`. The NVIDIA Container Toolkit injects driver +libs to `/usr/lib64` (RHEL/Fedora/WSL convention) or +`/usr/lib/x86_64-linux-gnu` (Debian/Ubuntu). musl also doesn't read +`/etc/ld.so.cache`, so the toolkit's `ldconfig` post-start hook is silently +ignored. + +**Fix (Layer 3).** Ship a static `/etc/ld-musl-x86_64.path`: +``` +/usr/lib/x86_64-linux-gnu +/usr/lib64 +/usr/lib/wsl/lib +/usr/lib +/usr/local/lib +/lib +``` +Listing all is safe — musl silently skips paths that don't exist. + +--- + +### P4. NVIDIA driver libs reference glibc-internal symbols missing from musl + +**Symptom.** Even with libs found, `dlopen("libcuda.so.1.1")` (the WSL2 +backend) fails with `Error relocating <lib>: <symbol>: symbol not found`. Iteratively +discovered missing symbols: `gnu_get_libc_version`, `__register_atfork`, +`dlmopen`, `dlvsym`, etc. + +**Root cause.** NVIDIA driver libs are built against glibc. +`gcompat` provides `libc.so.6` / `libm.so.6` / `libpthread.so.0` / +`librt.so.1` as musl wrappers, but is missing `libdl.so.2` (musl folds +`dlopen` into libc) and a number of glibc-internal helpers used by recent +drivers. + +**Fix (Layers 4 + 5).** + +- Install `gcompat` package. +- Symlink `libdl.so.2 → libgcompat.so.0` (driver's `DT_NEEDED libdl.so.2`).
+- Build a small `libnvshim.so` exporting the missing glibc-internal symbols + and `LD_PRELOAD` it. Final shim payload: + + | Symbol | Implementation | + |---|---| + | `gnu_get_libc_version` | return `"2.35"` | + | `gnu_get_libc_release` | return `"stable"` | + | `__libc_current_sigrtmin` / `__libc_current_sigrtmax` | musl macros exposed as functions | + | `__register_atfork` | redirect to `pthread_atfork` | + | `__cxa_thread_atexit_impl` | no-op | + | `__libc_single_threaded` | data symbol, value 0 | + | `secure_getenv` | redirect to `getenv` | + | `dlmopen` | redirect to `dlopen` (ignore Lmid_t) | + | `dlvsym` | redirect to `dlsym` (ignore version) | + | `__libc_dlopen_mode` / `__libc_dlsym` / `__libc_dlclose` | wrappers | + + > **Critical: `libnvshim.so` must NOT export `exit` / `_exit` / `_Exit`.** + > See P6 — interposing those swallows ffmpeg's real exit status. + +**Maintenance note.** Each new NVIDIA driver release may reference one more +glibc-internal symbol. Diagnostic recipe in §3 finds it in <5 minutes; fix +is a one-line addition to `libnvshim.so`. + +--- + +### P5. NVENC encode succeeds but exits 139 (SIGSEGV) at process teardown + +**Symptom.** Encode completes successfully (`frame= 60 ... muxing overhead`, +output bytes fully written), then ffmpeg exits with 139. + +**Root cause.** libcuda's `__cxa_finalize` / `DT_FINI` destructors run during +`avcodec_close → nvenc_free → cuCtxDestroy` while still inside `main()`. +Those destructors call into glibc-internal state (TLS-destructor unwinding, +pthread_atfork handlers) that musl + gcompat don't fully provide, and crash. + +Because the crash is inside `main()` (not after `exit()` is called), no +in-process hook — atexit, `LD_PRELOAD` signal handlers, etc. — can suppress +it cleanly. 
Attempts at in-process suppression all failed: + +| Attempt | Result | +|---|---| +| `nvshim` `exit()` interpose + atexit `_exit()` | SIGSEGV happens *before* `main()` returns; atexit never runs | +| In-process signal handler | Same — crash is in destructor before signal can dispatch | + +**Fix (Layer 6).** Out-of-process bash entrypoint wrapper that captures the +real exit code via `${PIPESTATUS[0]}` and downgrades **only** `139 → 0`, +gated on stderr containing no recognized error keyword. Real failures +(mid-encode CUDA OOM, init failures, etc.) propagate unchanged because they +always print an identifiable error first. + +```bash +#!/bin/bash +errfile=$(mktemp) +shellerr=$(mktemp) +trap "rm -f \"$errfile\" \"$shellerr\"" EXIT +exec 3>&1 +exec 4>&2 +exec 2>"$shellerr" +{ /ffmpeg "$@" 2>&1 1>&3 3>&-; } | tee "$errfile" >&4 +rc=${PIPESTATUS[0]} +exec 3>&- +exec 2>&4 4>&- +# Filter the bash job-control "Segmentation fault (core dumped)" line. +grep -vE "Segmentation fault.*core dumped.*/ffmpeg" "$shellerr" >&2 || true +# Suppress *only* the known-benign teardown SIGSEGV. +if [ "$rc" = "139" ] && ! grep -qiE "(^|[^a-z])(error|cannot load|conversion failed|not found|invalid|failed|no such)" "$errfile"; then + exit 0 +fi +exit "$rc" +``` + +ffprobe doesn't need the wrapper — it doesn't open NVENC encoders, so the +crashing destructor path isn't reached. + +--- + +### P6. ffmpeg silently exits 0 on every error path + +**Symptom.** Every fatal-error invocation of the CUDA build returned exit +code `0` to the shell, despite ffmpeg printing the correct error messages. 
+Verified against the non-CUDA `:8.1` baseline: + +| Scenario | non-CUDA `:8.1` | CUDA (broken) | CUDA (fixed) | +|---|---|---|---| +| `-c:v this_codec_does_not_exist` | `8` | `0` ❌ | `8` ✅ | +| `-i /no/such/file.mp4` | `254` | `0` ❌ | `254` ✅ | +| `-vf this_filter_does_not_exist` | `8` | `0` ❌ | `8` ✅ | +| Successful encode | `0` | `0` ✅ | `0` ✅ | +| Successful encode (post-teardown SEGV) | n/a | `139` (raw) | `0` (wrapped) | + +This was masked at first by an "upgrade exit 0 → 1 when stderr matches a +fatal-error keyword" branch in the wrapper. That made tests pass with a +plausible-looking exit `1`, but it was a workaround, not a fix — the wrong +exit code (`1` instead of `8`/`254`) broke any caller that switched on the +specific code. + +**Root-cause discovery.** An `LD_PRELOAD` `dladdr` tracer interposing `_exit` +revealed that on every code path — bad-codec, bad-input, even successful +`-version` — the call to `_exit` came from `libnvshim.so`: +``` +[exittrace] _exit(0) ra=0x... dso=/usr/local/lib/libnvshim.so +``` + +`libnvshim.so` had been given an `_exit` interposer (and at one point an +`exit` interposer too) as part of the abandoned in-process attempt to +suppress the teardown SIGSEGV (P5). The interposer always invoked +`syscall(SYS_exit_group, 0)` — i.e. it dropped ffmpeg's real exit status +and hard-coded `0`. None of the standard ELF / readelf / `nm` checks flag +this: the interposer is in a separately-loaded DSO, not in `/ffmpeg`, and +musl's PLT happily binds `_exit` to whichever DSO comes first in symbol +search order — `LD_PRELOAD` always wins. + +**Fix.** Drop the `_exit` (and `exit`) overrides from `libnvshim.so` +entirely. They were never needed for any glibc→musl ABI gap (those are all +the symbols in P4). Process-lifecycle suppression belongs in the +out-of-process bash wrapper (P5), where it can read the real exit status via +`${PIPESTATUS[0]}` and pattern-match on actual error keywords. 
+ +After removing the interposers, all standard ffmpeg exit codes match the +non-CUDA build byte-for-byte. + +**Lesson (now baked into Layer 5).** `LD_PRELOAD` shims should be the +*minimum* symbol set that closes the glibc→musl ABI gap. Any +process-lifecycle hook (exit, signal, atexit) added to such a shim will +silently apply to *every* call from the host program, not just the one +CUDA-driver call you were trying to fix. **Keep lifecycle policy +out-of-process.** + +--- + +### P7. Other small issues encountered (one-line each) + +| # | Issue | Fix | +|---|---|---| +| 1 | `nv-codec-headers` checksum mismatch | Recompute SHA256 against actual GitHub release tarball | +| 2 | ffmpeg link failed because `LDFLAGS` was set unconditionally and conflicted with `-static-pie` in non-CUDA branch | Gate the `LDFLAGS` export on `ENABLE_CUDA` only | +| 3 | Spurious dynamic deps (`libgomp`, `libdrm`, …) | Pre-link with `-Wl,-Bstatic` + `-static-libgcc -static-libstdc++` | +| 4 | Toolkit only mounted 180 KB stub `libcuda.so.1` (no `libnvcuvid` / `libnvidia-encode`) | Bake `ENV NVIDIA_DRIVER_CAPABILITIES=compute,video,utility` into image | +| 5 | WSL2 + nvidia-container-toolkit 1.19 SIGSEGV during prestart hook | Host-side regression unrelated to image; `wsl --shutdown` + restart | + +--- + +## 3. Diagnostics + +### 3a. Quick image probe (link state, env, driver libs, dlopen, encode) + +```sh +IMG=mwader/static-ffmpeg:8.1-cuda +docker run --rm --gpus all --entrypoint sh "$IMG" -c ' + apk add --no-cache gcc musl-dev binutils strace >/dev/null + + echo "=== 1. Linkage ===" + ldd /ffmpeg + readelf -d /ffmpeg | grep -E "NEEDED|BIND_NOW" + + echo "=== 2. musl loader path ===" + cat /etc/ld-musl-x86_64.path + + echo "=== 3. Driver libs mounted ===" + ls -lh /usr/lib64/libcuda.so.1 /usr/lib64/libnv*.so.1 \ + /usr/lib/wsl/drivers/nv_dispi.inf_amd64_*/libcuda.so.1.1 2>/dev/null + + echo "=== 4. 
Standalone dlopen + cuInit ===" + cat > /t.c <<EOF +#include <dlfcn.h> +#include <stdio.h> +int main(void){ + void *h = dlopen("libcuda.so.1", RTLD_LAZY); + if(!h){fprintf(stderr,"FAIL: %s\n",dlerror());return 1;} + int (*ci)(unsigned)=(int(*)(unsigned))dlsym(h,"cuInit"); + fprintf(stderr,"cuInit=%d\n", ci?ci(0):-99); + return 0; +} +EOF + gcc /t.c -o /t && /t + + echo "=== 5. ffmpeg openat trace for h264_nvenc ===" + strace -e trace=openat,access -f -o /tmp/ff.strace /ffmpeg \ + -hide_banner -loglevel error \ + -f lavfi -i testsrc=size=320x240:rate=30 -t 1 \ + -c:v h264_nvenc -f null - 2>&1 | tail -3 + grep -E "cuda|nvidia|nvcuvid|libnv|/dev/dxg|/dev/nvidia" /tmp/ff.strace | head -40 +' +``` + +### 3b. "Wrong exit code" regression check (guards against P6) + +```sh +docker run --rm --gpus all --entrypoint sh "$IMG" -c ' + apk add --no-cache gcc musl-dev >/dev/null + cat > /tmp/t.c <<EOF +#include <dlfcn.h> +#include <stdio.h> +#include <unistd.h> +#include <sys/syscall.h> +__attribute__((noreturn)) void _exit(int s){ + void *ra=__builtin_return_address(0); Dl_info i={0}; dladdr(ra,&i); + dprintf(2,"[trace] _exit(%d) dso=%s\n",s,i.dli_fname?i.dli_fname:"?"); + syscall(SYS_exit_group,s); __builtin_unreachable(); +} +EOF + gcc -O0 -fPIC -shared -o /tmp/t.so /tmp/t.c -ldl + LD_PRELOAD="/tmp/t.so:${LD_PRELOAD}" /ffmpeg -hide_banner -loglevel error \ + -f lavfi -i testsrc=duration=1:size=320x240:rate=30 \ + -c:v this_codec_does_not_exist -f null - +' +# The traced _exit MUST show dso=/lib/ld-musl-x86_64.so.1 (i.e. real libc). +# If it shows dso=/usr/local/lib/libnvshim.so → P6 regression is back. +``` + +### 3c. dlopen-stub regression check (guards against P1) + +```sh +docker run --gpus all --rm --entrypoint sh "$IMG" -c ' + apk add --no-cache binutils >/dev/null 2>&1 + readelf -s --dyn-syms /ffmpeg | grep -E "dlopen|dlsym|dlerror|dlclose" +' +# Each must be 0-size UND (or not exported at all). A non-zero size in .text +# (e.g. " 25 FUNC ... 14 dlopen") means the static stub bug is back. +``` + +--- + +## 4.
Build & verify + +### Build + +```sh +cd /path/to/static-ffmpeg + +docker build --no-cache \ + --build-arg ENABLE_CUDA=1 \ + --target final-cuda \ + -t mwader/static-ffmpeg:8.1-cuda . +``` + +> Use `--no-cache` if you previously built `:8.1-cuda` with broken link +> flags — Docker will otherwise reuse the cached ffmpeg layer that contains +> the static `dlopen` stub. Full rebuild ~45–75 min (libaom, libvmaf, x265, +> svt-av1, vvenc dominate). + +If you only changed the `final-cuda` stage (env, ld-musl path, wrapper), +`--no-cache` is unnecessary. + +### Final verification recipe (all five must pass) + +```sh +IMG=mwader/static-ffmpeg:8.1-cuda + +# 1. Static-ness check (exactly one NEEDED entry: musl libc) +docker run --rm --entrypoint sh "$IMG" -c ' + apk add --no-cache binutils >/dev/null 2>&1 + readelf -d /ffmpeg | grep -E "NEEDED|BIND_NOW" +' + +# 2. NVENC encode end-to-end +docker run --rm --gpus all "$IMG" \ + -hide_banner -loglevel error \ + -f lavfi -i testsrc=duration=2:size=1280x720:rate=30 \ + -c:v h264_nvenc -f null - ; echo "exit=$? (must be 0)" + +# 3. MP4-to-stdout byte-exactness (wrapper passthrough) +docker run --rm --gpus all "$IMG" \ + -hide_banner -loglevel error \ + -f lavfi -i testsrc=duration=1:size=320x240:rate=30 \ + -c:v h264_nvenc -f mp4 -movflags frag_keyframe+empty_moov - 2>/dev/null \ + | wc -c # must print > 0 + +# 4. ffprobe sanity (no wrapper) +docker run --rm --gpus all --entrypoint /ffprobe "$IMG" -version >/dev/null +echo "exit=$? (must be 0)" + +# 5. Exit-code parity vs non-CUDA :8.1 (regression guard for P6) +docker run --rm --gpus all "$IMG" -hide_banner -loglevel error \ + -f lavfi -i testsrc=duration=1:size=320x240:rate=30 \ + -c:v this_codec_does_not_exist -f null - ; echo "exit=$? (must be 8)" +docker run --rm --gpus all "$IMG" -hide_banner -loglevel error \ + -i /no/such/file.mp4 -f null - ; echo "exit=$? (must be 254)" +``` + +--- + +## 5. Runtime requirements + +### Host +- NVIDIA driver installed. 
+- [NVIDIA Container Toolkit](https://github.com/NVIDIA/nvidia-container-toolkit) installed and configured for Docker. +- Run with `--gpus all` (or `--runtime=nvidia` + `NVIDIA_VISIBLE_DEVICES`). + +### Image-side env (set by Dockerfile) +- `NVIDIA_VISIBLE_DEVICES=all` +- `NVIDIA_DRIVER_CAPABILITIES=compute,utility,video` + - `compute` → `libcuda.so.1` + - `video` → `libnvcuvid.so`, `libnvidia-encode.so` + - Dropping `video` makes `nvidia-smi` work but breaks `h264_nvenc` with `Cannot load libcuda.so.1`. + +### Toolkit driver-injection layouts covered by `/etc/ld-musl-x86_64.path` +- Debian/Ubuntu hosts → `/usr/lib/x86_64-linux-gnu` +- RHEL/Fedora hosts → `/usr/lib64` +- WSL2 → `/usr/lib/wsl/lib` + +--- + +## 6. Runtime call chain (six layers in action) + +``` +docker run --gpus all ⇒ toolkit injects libcuda.so.1 → /usr/lib64 + + sets NVIDIA_DRIVER_CAPABILITIES from image ENV + │ + ▼ +ffmpeg-cuda-entrypoint (bash) ← Layer 6 (P5) + │ exec + ▼ +/ffmpeg (musl dynamic-PIE, libc-only NEEDED) ← Layer 2 (P1) + │ ld.so loads libc.musl-x86_64.so.1 + │ (search path includes /usr/lib64 from /etc/ld-musl-x86_64.path) ← Layer 3 (P3) + │ LD_PRELOAD → /usr/local/lib/libnvshim.so ← Layer 5 (P4) + ▼ +ffnvcodec dynlink_loader.h: + dlopen("libcuda.so.1", RTLD_LAZY) ← needs Layer 1 (real PLT entry, P1) + │ + ▼ ld.so loads libcuda.so.1 (WSL stub) + │ resolves DT_NEEDED libdl.so.2 → libgcompat.so.0 ← Layer 4 (P4) + │ + ▼ libcuda dlopens its WSL backend libcuda.so.1.1 + │ resolves glibc-internals via libnvshim.so ← Layer 5 (P4) + │ + ▼ encode runs successfully, frames produced, output flushed + │ + ▼ ffmpeg main() → avcodec_close → cuCtxDestroy + │ libcuda __cxa_finalize crashes during teardown ☠ SIGSEGV (P5) + │ + ▼ wrapper sees exit=139, no error keyword in stderr → exit 0 ← Layer 6 (P5) +``` + +--- + +## 7. Comparison with other static ffmpeg + nvenc projects + +| Project | Static? | NVENC? 
| Approach | +|---|---|---|---| +| `mwader/static-ffmpeg:8.1` | ✅ static-pie musl | ❌ | Pure static, no dlopen | +| `mwader/static-ffmpeg:8.1-cuda` | ⚠️ musl dynamic-PIE (libc only) | ✅ | Hybrid — only libc dynamic; `dlopen()` works | +| BtbN/FFmpeg-Builds (LGPL/GPL) | ⚠️ glibc dynamic + runtime ldconfig | ✅ | Tarball, glibc-linked | +| HiWay-Media/ffmpeg-nvenc-static | ⚠️ glibc dynamic | ✅ | Bundled libs | +| markus-perl/ffmpeg-build-script | ⚠️ glibc dynamic | optional | Script, not container | + +Of these, only `:8.1-cuda` keeps every codec/lib statically linked — every +other "static + nvenc" build is glibc-dynamic. The trade-off vs the default +`:8.1` is exactly one libc.so dependency. + +--- + +## 8. CI / publishing notes + +- Default tag: built for `linux/amd64,linux/arm64` as before. +- CUDA tag: built for `linux/amd64` only. + - Pushed as `<version>-cuda` (and re-tagged manifest-style as `<version>-cuda-amd64` for clarity). + - `latest-cuda` follows latest stable. +- Use `--target final-cuda` and `--build-arg ENABLE_CUDA=1` in the CI matrix. +