diff --git a/.github/build.bat b/.github/build.bat
index e4991a1a..d48f0957 100755
--- a/.github/build.bat
+++ b/.github/build.bat
@@ -38,15 +38,28 @@ if /I "%USE_CACHE%"=="true" (
     )
 )
 
+REM NOTE: nvcc is NOT wrapped with sccache on Windows. Unlike build.sh (Linux) -- where
+REM sccache caches the per-arch .cu device passes -- sccache on Windows cannot parse the
+REM nvcc command line (it dies with `sccache: error: Could not parse shell line` and
+REM fails every .cu compile). So CUDA device code is built by nvcc directly (uncached)
+REM here; the cl.exe C/C++ TUs still cache via the C/CXX launcher set above.
+
 mkdir build
 cmake -Bbuild %LAUNCH% %*
-if errorlevel 1 exit /b %ERRORLEVEL%
+if errorlevel 1 exit /b 1
 cmake --build build --config Release
-if errorlevel 1 exit /b %ERRORLEVEL%
+set "BUILD_RC=%ERRORLEVEL%"
 
-REM Only query stats when sccache was actually wired in as the launcher; re-invoking
-REM a rejected/crashing sccache here would just repeat its failure output.
+REM Print cache stats (best-effort) regardless of build outcome -- only when sccache
+REM was wired in as the launcher.
 if defined LAUNCH (
     echo build.bat: sccache --show-stats
     sccache --show-stats
 )
+
+REM Propagate a build failure as a non-zero exit (a prior bug let a failed `cmake
+REM --build` reach here and exit 0, masquerading as a green build with no artifacts).
+if not "%BUILD_RC%"=="0" (
+    echo build.bat: cmake --build failed with exit code %BUILD_RC%.
+    exit /b %BUILD_RC%
+)
diff --git a/.github/build_opencl_android.sh b/.github/build_opencl_android.sh
index efa3789c..491a59b5 100755
--- a/.github/build_opencl_android.sh
+++ b/.github/build_opencl_android.sh
@@ -20,9 +20,9 @@ HEADERS_DIR="$OPENCL_STAGE/OpenCL-Headers"
 LOADER_DIR="$OPENCL_STAGE/OpenCL-ICD-Loader"
 LOADER_BUILD="$LOADER_DIR/build"
 
-# Pinned tags for reproducibility.
-HEADERS_TAG=v2025.07.22
-LOADER_TAG=v2025.07.22
+# Pinned tags for reproducibility (OpenCL 3.1.1 spec release).
+HEADERS_TAG=v2026.05.29
+LOADER_TAG=v2026.05.29
 
 if [ ! -d "$HEADERS_DIR" ]; then
     mkdir -p "$OPENCL_STAGE"
diff --git a/.github/build_opencl_windows.bat b/.github/build_opencl_windows.bat
new file mode 100644
index 00000000..dbc9c5b4
--- /dev/null
+++ b/.github/build_opencl_windows.bat
@@ -0,0 +1,54 @@
+@echo off
+setlocal enabledelayedexpansion
+
+REM SPDX-FileCopyrightText: 2026 Bernard Ladenthin <bernard.ladenthin@gmail.com>
+REM
+REM SPDX-License-Identifier: MIT
+REM
+REM Windows x86_64 build with the OpenCL backend enabled, shipped as the
+REM `opencl-windows-x86-64` classifier. The windows-2025 runner image ships
+REM neither OpenCL headers nor an OpenCL import library, so this script first
+REM stages Khronos OpenCL-Headers and builds OpenCL-ICD-Loader (producing
+REM OpenCL.lib) before delegating the jllama configure+build to build.bat with
+REM the OpenCL paths. Mirrors build_opencl_android.sh.
+REM
+REM At runtime the GPU vendor's ICD (System32\OpenCL.dll, installed by the
+REM NVIDIA/AMD/Intel driver) provides the actual OpenCL symbols; we link only
+REM against the loader's import library, so no OpenCL.dll is shipped.
+
+set "OPENCL_STAGE=%RUNNER_TEMP%\opencl-stage"
+if "%RUNNER_TEMP%"=="" set "OPENCL_STAGE=%TEMP%\opencl-stage"
+set "HEADERS_DIR=%OPENCL_STAGE%\OpenCL-Headers"
+set "LOADER_DIR=%OPENCL_STAGE%\OpenCL-ICD-Loader"
+set "LOADER_BUILD=%LOADER_DIR%\build"
+
+REM Pinned tags for reproducibility (OpenCL 3.1.1; match build_opencl_android.sh).
+set "HEADERS_TAG=v2026.05.29"
+set "LOADER_TAG=v2026.05.29"
+
+if not exist "%HEADERS_DIR%" (
+    git clone --depth 1 --branch %HEADERS_TAG% https://github.com/KhronosGroup/OpenCL-Headers.git "%HEADERS_DIR%"
+    if errorlevel 1 exit /b 1
+)
+
+if not exist "%LOADER_BUILD%\Release\OpenCL.lib" if not exist "%LOADER_BUILD%\OpenCL.lib" (
+    if not exist "%LOADER_DIR%" (
+        git clone --depth 1 --branch %LOADER_TAG% https://github.com/KhronosGroup/OpenCL-ICD-Loader.git "%LOADER_DIR%"
+        if errorlevel 1 exit /b 1
+    )
+    cmake -B "%LOADER_BUILD%" -S "%LOADER_DIR%" -DOPENCL_ICD_LOADER_HEADERS_DIR="%HEADERS_DIR%" -DBUILD_TESTING=OFF
+    if errorlevel 1 exit /b 1
+    cmake --build "%LOADER_BUILD%" --config Release
+    if errorlevel 1 exit /b 1
+)
+
+REM Resolve the import library: multi-config generators emit build\Release\OpenCL.lib,
+REM single-config ones emit build\OpenCL.lib.
+set "OPENCL_LIB=%LOADER_BUILD%\Release\OpenCL.lib"
+if not exist "%OPENCL_LIB%" set "OPENCL_LIB=%LOADER_BUILD%\OpenCL.lib"
+
+REM Delegate to build.bat so the jllama build inherits the sccache probe + Depot
+REM cache launcher and --show-stats output. The OpenCL paths satisfy ggml's
+REM find_package(OpenCL); the caller appends -G/-DGGML_OPENCL/-DOS_* via %*.
+call .github\build.bat -DOpenCL_INCLUDE_DIR="%HEADERS_DIR%" -DOpenCL_LIBRARY="%OPENCL_LIB%" %*
+exit /b %ERRORLEVEL%
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index c2d82f36..eefed904 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -476,8 +476,8 @@ jobs:
           name: macos-14-libraries
           path: ${{ github.workspace }}/src/main/resources/net/ladenthin/llama/
 
-  build-windows-x86_64:
-    name: Build and Test Windows 2025 x86_64 (VS 2026)
+  build-windows-x86_64-msvc:
+    name: Build and Test Windows 2025 x86_64 (MSVC / VS 2026, classifier)
     needs: [startgate, build-webui]
     runs-on: windows-2025-vs2026
     steps:
@@ -507,11 +507,11 @@ jobs:
       - name: Upload artifacts
         uses: actions/upload-artifact@v7
         with:
-          name: Windows-x86_64-libraries
+          name: Windows-x86_64-msvc
           path: ${{ github.workspace }}/src/main/resources/net/ladenthin/llama/
 
-  build-windows-x86:
-    name: Build and Test Windows 2025 x86 (VS 2026)
+  build-windows-x86-msvc:
+    name: Build and Test Windows 2025 x86 (MSVC / VS 2026, classifier)
     needs: [startgate, build-webui]
     runs-on: windows-2025-vs2026
     steps:
@@ -541,25 +541,25 @@ jobs:
       - name: Upload artifacts
         uses: actions/upload-artifact@v7
         with:
-          name: Windows-x86-libraries
+          name: Windows-x86-msvc
           path: ${{ github.workspace }}/src/main/resources/net/ladenthin/llama/
 
   # ---------------------------------------------------------------------------
-  # Windows Ninja Multi-Config + sccache — EVALUATION jobs (not yet released).
-  # The Visual Studio generator ignores CMAKE_{C,CXX}_COMPILER_LAUNCHER, so the two
-  # build-windows-* jobs above are the only uncached native builds. Upstream
-  # llama.cpp ships its windows-cuda artifact with "Ninja Multi-Config" + MSVC,
-  # which proves the combination works on the same tree. These two jobs run that
-  # combination in parallel with the trusted VS jobs and front cl.exe with sccache
-  # over Depot WebDAV (build.bat probe-guards it). Artifacts are named
-  # `Windows-*-ninja` (NOT `*-libraries`) so the package job's `pattern: "*-libraries"`
-  # does NOT pick them up — they are evaluation-only until cache hits are confirmed,
-  # at which point the release path is switched over (see TODO.md). The package job's
-  # `needs:` is intentionally left unchanged.
+  # Windows Ninja Multi-Config + sccache — the DEFAULT Windows CPU natives.
+  # The Visual Studio generator ignores CMAKE_{C,CXX}_COMPILER_LAUNCHER, so only the
+  # Ninja Multi-Config generator can front cl.exe with sccache over Depot WebDAV
+  # (build.bat probe-guards it). Both generators use the same MSVC toolchain (cl.exe,
+  # static /MT CRT) on the same runner, so the produced jllama.dll/llama.dll/ggml.dll
+  # are functionally equivalent with identical runtime dependencies — the only delta
+  # is build-system plumbing + caching. The Ninja build is therefore the default JAR
+  # (artifacts `Windows-*-libraries`, picked up by the package job's `pattern:
+  # "*-libraries"`); the MSVC build above is shipped as the `msvc-windows` classifier
+  # for anyone who wants the Visual-Studio-generator natives. Upstream llama.cpp also
+  # builds its Windows artifacts with Ninja Multi-Config + MSVC.
   # ---------------------------------------------------------------------------
 
-  build-windows-x86_64-ninja:
-    name: Build and Test Windows 2025 x86_64 (Ninja Multi-Config + sccache, eval)
+  build-windows-x86_64:
+    name: Build and Test Windows 2025 x86_64 (Ninja Multi-Config + sccache, default)
     needs: [startgate, build-webui]
     runs-on: windows-2025-vs2026
     env:
@@ -609,11 +609,11 @@ jobs:
       - name: Upload artifacts
         uses: actions/upload-artifact@v7
         with:
-          name: Windows-x86_64-ninja
+          name: Windows-x86_64-libraries
           path: ${{ github.workspace }}/src/main/resources/net/ladenthin/llama/
 
-  build-windows-x86-ninja:
-    name: Build and Test Windows 2025 x86 (Ninja Multi-Config + sccache, eval)
+  build-windows-x86:
+    name: Build and Test Windows 2025 x86 (Ninja Multi-Config + sccache, default)
     needs: [startgate, build-webui]
     runs-on: windows-2025-vs2026
     env:
@@ -663,9 +663,172 @@ jobs:
       - name: Upload artifacts
         uses: actions/upload-artifact@v7
         with:
-          name: Windows-x86-ninja
+          name: Windows-x86-libraries
           path: ${{ github.workspace }}/src/main/resources/net/ladenthin/llama/
 
+  # ---------------------------------------------------------------------------
+  # Windows GPU classifiers (x86_64 only) — CUDA, Vulkan, OpenCL.
+  # All three use the same Ninja Multi-Config + MSVC + sccache toolchain as the
+  # default CPU build; they differ only by the GGML backend flag (and the build-time
+  # SDK each needs). CMakeLists.txt routes each backend's output to its own
+  # src/main/resources_windows_{cuda,vulkan,opencl}/ tree, which the matching Maven
+  # profile (cuda-windows / vulkan-windows / opencl-windows) turns into a classifier
+  # JAR. GPU runtime libraries are NOT bundled — the consumer's GPU driver / toolkit
+  # provides them (CUDA: cudart64_13/cublas64_13 from the CUDA Toolkit; Vulkan:
+  # vulkan-1.dll from the driver; OpenCL: System32\OpenCL.dll from the driver).
+  # NOTE: GitHub-hosted Windows runners have NO GPU, so these jobs only build the
+  # artifact (no ctest, no model-backed GPU inference); the C++ unit suite runs in
+  # the CPU jobs, and end-to-end GPU validation is local / self-hosted.
+  # ---------------------------------------------------------------------------
+
+  build-windows-x86_64-cuda:
+    name: Build Windows 2025 x86_64 CUDA (Ninja + sccache)
+    needs: [startgate, build-webui]
+    runs-on: windows-2025-vs2026
+    env:
+      USE_CACHE: ${{ github.event_name != 'workflow_dispatch' || inputs.use_cache }}
+      SCCACHE_WEBDAV_ENDPOINT: https://cache.depot.dev
+      SCCACHE_WEBDAV_TOKEN: ${{ secrets.DEPOT_TOKEN }}
+    steps:
+      - uses: actions/checkout@v7
+      - name: Download shared WebUI assets
+        uses: actions/download-artifact@v8
+        with:
+          name: webui-generated
+          path: ${{ github.workspace }}/webui-generated/
+      - name: Set up MSVC developer environment (x64)
+        uses: ilammy/msvc-dev-cmd@v1
+        with:
+          arch: x64
+      - name: Install CUDA Toolkit
+        # Full toolkit install (default method: local, no sub-packages restriction).
+        # A reduced network sub-package set ("nvcc","cudart","cublas",…) omitted the
+        # nvcc crt headers (crt/host_config.h), so cmake's CUDA compiler detection
+        # failed at configure. The full installer ships every header reliably.
+        uses: Jimver/cuda-toolkit@v0.2.35
+        id: cuda-toolkit
+        with:
+          cuda: '13.2.0'
+      - name: Install sccache (shared compiler cache)
+        if: env.USE_CACHE == 'true' && env.SCCACHE_WEBDAV_TOKEN != ''
+        continue-on-error: true
+        shell: pwsh
+        run: |
+          $ver = "0.16.0"
+          $rel = "sccache-v$ver-x86_64-pc-windows-msvc"
+          $url = "https://github.com/mozilla/sccache/releases/download/v$ver/$rel.zip"
+          Write-Host "Downloading $url"
+          Invoke-WebRequest -Uri $url -OutFile "$env:RUNNER_TEMP\sccache.zip"
+          Expand-Archive -Path "$env:RUNNER_TEMP\sccache.zip" -DestinationPath "$env:RUNNER_TEMP\sccache" -Force
+          Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\sccache\$rel"
+      - name: Build libraries
+        shell: cmd
+        # GPU jobs build the artifact only — no -DBUILD_TESTING / ctest. The C++ unit
+        # suite is CPU-only and fully covered by the `C++ Tests` job + the CPU Windows
+        # jobs; a GPU-linked jllama_test.exe cannot be discovered/run on a GPU-less
+        # GitHub runner (it errors probing for a CUDA device -> ctest *_NOT_BUILT).
+        run: |
+          .github\build.bat -G "Ninja Multi-Config" -DGGML_CUDA=ON -DOS_NAME=Windows -DOS_ARCH=x86_64
+      - name: Upload artifacts
+        uses: actions/upload-artifact@v7
+        with:
+          name: Windows-x86_64-cuda
+          path: ${{ github.workspace }}/src/main/resources_windows_cuda/net/ladenthin/llama/
+          if-no-files-found: error
+
+  build-windows-x86_64-vulkan:
+    name: Build Windows 2025 x86_64 Vulkan (Ninja + sccache)
+    needs: [startgate, build-webui]
+    runs-on: windows-2025-vs2026
+    env:
+      USE_CACHE: ${{ github.event_name != 'workflow_dispatch' || inputs.use_cache }}
+      SCCACHE_WEBDAV_ENDPOINT: https://cache.depot.dev
+      SCCACHE_WEBDAV_TOKEN: ${{ secrets.DEPOT_TOKEN }}
+    steps:
+      - uses: actions/checkout@v7
+      - name: Download shared WebUI assets
+        uses: actions/download-artifact@v8
+        with:
+          name: webui-generated
+          path: ${{ github.workspace }}/webui-generated/
+      - name: Set up MSVC developer environment (x64)
+        uses: ilammy/msvc-dev-cmd@v1
+        with:
+          arch: x64
+      - name: Install Vulkan SDK
+        uses: jakoch/install-vulkan-sdk-action@v1.6.0
+        with:
+          vulkan_version: 1.4.350.0
+          cache: true
+      - name: Install sccache (shared compiler cache)
+        if: env.USE_CACHE == 'true' && env.SCCACHE_WEBDAV_TOKEN != ''
+        continue-on-error: true
+        shell: pwsh
+        run: |
+          $ver = "0.16.0"
+          $rel = "sccache-v$ver-x86_64-pc-windows-msvc"
+          $url = "https://github.com/mozilla/sccache/releases/download/v$ver/$rel.zip"
+          Write-Host "Downloading $url"
+          Invoke-WebRequest -Uri $url -OutFile "$env:RUNNER_TEMP\sccache.zip"
+          Expand-Archive -Path "$env:RUNNER_TEMP\sccache.zip" -DestinationPath "$env:RUNNER_TEMP\sccache" -Force
+          Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\sccache\$rel"
+      - name: Build libraries
+        shell: cmd
+        # Build the artifact only (see the CUDA job's note: GPU-less runner can't run a
+        # GPU-linked jllama_test; the C++ unit suite is covered by the CPU jobs).
+        run: |
+          .github\build.bat -G "Ninja Multi-Config" -DGGML_VULKAN=ON -DOS_NAME=Windows -DOS_ARCH=x86_64
+      - name: Upload artifacts
+        uses: actions/upload-artifact@v7
+        with:
+          name: Windows-x86_64-vulkan
+          path: ${{ github.workspace }}/src/main/resources_windows_vulkan/net/ladenthin/llama/
+          if-no-files-found: error
+
+  build-windows-x86_64-opencl:
+    name: Build Windows 2025 x86_64 OpenCL (Ninja + sccache)
+    needs: [startgate, build-webui]
+    runs-on: windows-2025-vs2026
+    env:
+      USE_CACHE: ${{ github.event_name != 'workflow_dispatch' || inputs.use_cache }}
+      SCCACHE_WEBDAV_ENDPOINT: https://cache.depot.dev
+      SCCACHE_WEBDAV_TOKEN: ${{ secrets.DEPOT_TOKEN }}
+    steps:
+      - uses: actions/checkout@v7
+      - name: Download shared WebUI assets
+        uses: actions/download-artifact@v8
+        with:
+          name: webui-generated
+          path: ${{ github.workspace }}/webui-generated/
+      - name: Set up MSVC developer environment (x64)
+        uses: ilammy/msvc-dev-cmd@v1
+        with:
+          arch: x64
+      - name: Install sccache (shared compiler cache)
+        if: env.USE_CACHE == 'true' && env.SCCACHE_WEBDAV_TOKEN != ''
+        continue-on-error: true
+        shell: pwsh
+        run: |
+          $ver = "0.16.0"
+          $rel = "sccache-v$ver-x86_64-pc-windows-msvc"
+          $url = "https://github.com/mozilla/sccache/releases/download/v$ver/$rel.zip"
+          Write-Host "Downloading $url"
+          Invoke-WebRequest -Uri $url -OutFile "$env:RUNNER_TEMP\sccache.zip"
+          Expand-Archive -Path "$env:RUNNER_TEMP\sccache.zip" -DestinationPath "$env:RUNNER_TEMP\sccache" -Force
+          Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\sccache\$rel"
+      - name: Build libraries
+        shell: cmd
+        # Build the artifact only (see the CUDA job's note: GPU-less runner can't run a
+        # GPU-linked jllama_test; the C++ unit suite is covered by the CPU jobs).
+        run: |
+          .github\build_opencl_windows.bat -G "Ninja Multi-Config" -DGGML_OPENCL=ON -DGGML_OPENCL_EMBED_KERNELS=ON -DOS_NAME=Windows -DOS_ARCH=x86_64
+      - name: Upload artifacts
+        uses: actions/upload-artifact@v7
+        with:
+          name: Windows-x86_64-opencl
+          path: ${{ github.workspace }}/src/main/resources_windows_opencl/net/ladenthin/llama/
+          if-no-files-found: error
+
   # ---------------------------------------------------------------------------
   # CI-only jobs — no release artifact, purely for test coverage
   # ---------------------------------------------------------------------------
@@ -1154,7 +1317,7 @@ jobs:
           if-no-files-found: warn
 
   test-java-windows-x86_64:
-    name: Java Tests Windows 2025 x86_64 (VS 2026)
+    name: Java Tests Windows 2025 x86_64 (default / Ninja)
     needs: build-windows-x86_64
     runs-on: windows-2025-vs2026
     steps:
@@ -1258,13 +1421,13 @@ jobs:
             ${{ github.workspace }}/src/main/resources/net/ladenthin/llama/**/*
           if-no-files-found: warn
 
-  # Java/inference validation of the Ninja-built x86_64 DLL (the analogue of
-  # test-java-windows-x86_64 for the MSVC build). Loads the Ninja jllama.dll via
-  # JNI and runs the full model-backed suite, so both Windows generators are
-  # validated end-to-end before the `ninja-windows` classifier JAR ships.
-  test-java-windows-x86_64-ninja:
-    name: Java Tests Windows 2025 x86_64 (Ninja, eval)
-    needs: build-windows-x86_64-ninja
+  # Java/inference validation of the MSVC-built x86_64 DLL (the analogue of
+  # test-java-windows-x86_64 for the default Ninja build). Loads the MSVC jllama.dll
+  # via JNI and runs the full model-backed suite, so both Windows generators are
+  # validated end-to-end before the `msvc-windows` classifier JAR ships.
+  test-java-windows-x86_64-msvc:
+    name: Java Tests Windows 2025 x86_64 (MSVC classifier)
+    needs: build-windows-x86_64-msvc
     runs-on: windows-2025-vs2026
     steps:
       - uses: actions/checkout@v7
@@ -1281,7 +1444,7 @@ jobs:
           Get-ComputerInfo -Property "CsProcessors*" 2>$null || Write-Host "Get-ComputerInfo not available"
       - uses: actions/download-artifact@v8
         with:
-          name: Windows-x86_64-ninja
+          name: Windows-x86_64-msvc
           path: ${{ github.workspace }}/src/main/resources/net/ladenthin/llama/
       - name: Cache GGUF models (GitHub Actions cache; avoids re-downloading from HuggingFace)
         uses: actions/cache@v5
@@ -1355,7 +1518,7 @@ jobs:
       - if: failure()
         uses: actions/upload-artifact@v7
         with:
-          name: windows-output-ninja
+          name: windows-output-msvc
           path: |
             ${{ github.workspace }}\hs_err_pid*.log
             ${{ github.workspace }}\*.hprof
@@ -1378,9 +1541,13 @@ jobs:
       - crosscompile-linux-aarch64
       - crosscompile-android-aarch64
       - crosscompile-android-aarch64-opencl
+      - build-windows-x86_64
       - build-windows-x86
-      - build-windows-x86_64-ninja
-      - build-windows-x86-ninja
+      - build-windows-x86_64-msvc
+      - build-windows-x86-msvc
+      - build-windows-x86_64-cuda
+      - build-windows-x86_64-vulkan
+      - build-windows-x86_64-opencl
       - test-cpp-linux-x86_64
       - build-macos-arm64-metal-15
       - test-java-linux-x86_64
@@ -1388,7 +1555,7 @@ jobs:
       - test-java-macos-arm64-no-metal
       - test-java-macos-arm64-metal-15
       - test-java-windows-x86_64
-      - test-java-windows-x86_64-ninja
+      - test-java-windows-x86_64-msvc
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v7
@@ -1405,17 +1572,29 @@ jobs:
         with:
           name: android-libraries-opencl
           path: ${{ github.workspace }}/src/main/resources_android_opencl/net/ladenthin/llama/
-      # Ninja-built Windows natives -> separate tree consumed by the `windows-ninja`
-      # Maven profile (the `ninja-windows` classifier JAR). The default JAR keeps the
-      # MSVC `*-libraries` natives downloaded above.
+      # MSVC-built Windows natives -> `msvc-windows` classifier tree. The default JAR
+      # now ships the Ninja `*-libraries` natives merged above (default flip).
       - uses: actions/download-artifact@v8
         with:
-          name: Windows-x86_64-ninja
-          path: ${{ github.workspace }}/src/main/resources_windows_ninja/net/ladenthin/llama/
+          name: Windows-x86_64-msvc
+          path: ${{ github.workspace }}/src/main/resources_windows_msvc/net/ladenthin/llama/
       - uses: actions/download-artifact@v8
         with:
-          name: Windows-x86-ninja
-          path: ${{ github.workspace }}/src/main/resources_windows_ninja/net/ladenthin/llama/
+          name: Windows-x86-msvc
+          path: ${{ github.workspace }}/src/main/resources_windows_msvc/net/ladenthin/llama/
+      # Windows GPU classifiers (x86_64 only) -> one tree each.
+      - uses: actions/download-artifact@v8
+        with:
+          name: Windows-x86_64-cuda
+          path: ${{ github.workspace }}/src/main/resources_windows_cuda/net/ladenthin/llama/
+      - uses: actions/download-artifact@v8
+        with:
+          name: Windows-x86_64-vulkan
+          path: ${{ github.workspace }}/src/main/resources_windows_vulkan/net/ladenthin/llama/
+      - uses: actions/download-artifact@v8
+        with:
+          name: Windows-x86_64-opencl
+          path: ${{ github.workspace }}/src/main/resources_windows_opencl/net/ladenthin/llama/
       - uses: actions/setup-java@v5
         with:
           distribution: 'temurin'
@@ -1426,8 +1605,10 @@ jobs:
         # default-platform native libs in one drop-on-classpath JAR, runnable via its
         # OpenAiCompatServer Main-Class). It lands in target/ and is uploaded in the `llama-jars`
         # artifact below - a CI run artifact only, not a Maven Central / GitHub-Release asset.
-        # `windows-ninja` attaches the `ninja-windows` classifier JAR (Ninja-built Windows natives).
-        run: mvn --batch-mode --no-transfer-progress -P release,cuda,opencl-android,windows-ninja,assembly -Dmaven.test.skip=true -Dgpg.skip=true package
+        # Windows classifier profiles: `windows-msvc` (MSVC-built CPU natives -> `msvc-windows`
+        # classifier JAR) plus the GPU backends `cuda-windows` / `vulkan-windows` / `opencl-windows`.
+        # The default JAR's Windows natives are the Ninja `*-libraries` merged into src/main/resources/ above.
+        run: mvn --batch-mode --no-transfer-progress -P release,cuda,opencl-android,windows-msvc,cuda-windows,vulkan-windows,opencl-windows,assembly -Dmaven.test.skip=true -Dgpg.skip=true package
       - name: Upload JARs
         uses: actions/upload-artifact@v7
         with:
@@ -1507,12 +1688,24 @@ jobs:
           path: ${{ github.workspace }}/src/main/resources_android_opencl/net/ladenthin/llama/
       - uses: actions/download-artifact@v8
         with:
-          name: Windows-x86_64-ninja
-          path: ${{ github.workspace }}/src/main/resources_windows_ninja/net/ladenthin/llama/
+          name: Windows-x86_64-msvc
+          path: ${{ github.workspace }}/src/main/resources_windows_msvc/net/ladenthin/llama/
+      - uses: actions/download-artifact@v8
+        with:
+          name: Windows-x86-msvc
+          path: ${{ github.workspace }}/src/main/resources_windows_msvc/net/ladenthin/llama/
+      - uses: actions/download-artifact@v8
+        with:
+          name: Windows-x86_64-cuda
+          path: ${{ github.workspace }}/src/main/resources_windows_cuda/net/ladenthin/llama/
       - uses: actions/download-artifact@v8
         with:
-          name: Windows-x86-ninja
-          path: ${{ github.workspace }}/src/main/resources_windows_ninja/net/ladenthin/llama/
+          name: Windows-x86_64-vulkan
+          path: ${{ github.workspace }}/src/main/resources_windows_vulkan/net/ladenthin/llama/
+      - uses: actions/download-artifact@v8
+        with:
+          name: Windows-x86_64-opencl
+          path: ${{ github.workspace }}/src/main/resources_windows_opencl/net/ladenthin/llama/
       - name: Set up Maven Central Repository
         uses: actions/setup-java@v5
         with:
@@ -1533,7 +1726,7 @@ jobs:
             *) echo "::error::Refusing to publish non-SNAPSHOT version '$VERSION' from the snapshot job. Snapshot publishing requires a -SNAPSHOT version; releases go through the v* tag path."; exit 1 ;;
           esac
       - name: Publish snapshot
-        run: mvn --batch-mode --no-transfer-progress -P release,cuda,opencl-android,windows-ninja -Dmaven.test.skip=true deploy
+        run: mvn --batch-mode --no-transfer-progress -P release,cuda,opencl-android,windows-msvc,cuda-windows,vulkan-windows,opencl-windows -Dmaven.test.skip=true deploy
         env:
           MAVEN_USERNAME: ${{ secrets.CENTRAL_USERNAME }}
           MAVEN_PASSWORD: ${{ secrets.CENTRAL_TOKEN }}
@@ -1599,12 +1792,24 @@ jobs:
           path: ${{ github.workspace }}/src/main/resources_android_opencl/net/ladenthin/llama/
       - uses: actions/download-artifact@v8
         with:
-          name: Windows-x86_64-ninja
-          path: ${{ github.workspace }}/src/main/resources_windows_ninja/net/ladenthin/llama/
+          name: Windows-x86_64-msvc
+          path: ${{ github.workspace }}/src/main/resources_windows_msvc/net/ladenthin/llama/
+      - uses: actions/download-artifact@v8
+        with:
+          name: Windows-x86-msvc
+          path: ${{ github.workspace }}/src/main/resources_windows_msvc/net/ladenthin/llama/
+      - uses: actions/download-artifact@v8
+        with:
+          name: Windows-x86_64-cuda
+          path: ${{ github.workspace }}/src/main/resources_windows_cuda/net/ladenthin/llama/
+      - uses: actions/download-artifact@v8
+        with:
+          name: Windows-x86_64-vulkan
+          path: ${{ github.workspace }}/src/main/resources_windows_vulkan/net/ladenthin/llama/
       - uses: actions/download-artifact@v8
         with:
-          name: Windows-x86-ninja
-          path: ${{ github.workspace }}/src/main/resources_windows_ninja/net/ladenthin/llama/
+          name: Windows-x86_64-opencl
+          path: ${{ github.workspace }}/src/main/resources_windows_opencl/net/ladenthin/llama/
       - name: Set up Maven Central Repository
         uses: actions/setup-java@v5
         with:
@@ -1616,7 +1821,7 @@ jobs:
           gpg-private-key: ${{ secrets.GPG_PRIVATE_KEY }}
           gpg-passphrase: MAVEN_GPG_PASSPHRASE
       - name: Publish release
-        run: mvn --batch-mode --no-transfer-progress -P release,cuda,opencl-android,windows-ninja -Dmaven.test.skip=true deploy
+        run: mvn --batch-mode --no-transfer-progress -P release,cuda,opencl-android,windows-msvc,cuda-windows,vulkan-windows,opencl-windows -Dmaven.test.skip=true deploy
         env:
           MAVEN_USERNAME: ${{ secrets.CENTRAL_USERNAME }}
           MAVEN_PASSWORD: ${{ secrets.CENTRAL_TOKEN }}
diff --git a/.gitignore b/.gitignore
index 8aabd814..be02ca4b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -40,7 +40,12 @@ replay_pid*
 models/*.gguf
 src/main/cpp/net_ladenthin_llama_*.h
 src/main/resources_cuda_linux/
-src/main/resources_windows_ninja/
+# Per-classifier native trees, staged by CI before the matching Maven profile runs,
+# never committed (same policy as the default-tree native libs below).
+src/main/resources_windows_msvc/
+src/main/resources_windows_cuda/
+src/main/resources_windows_vulkan/
+src/main/resources_windows_opencl/
 src/main/resources/**/*.so
 src/main/resources/**/*.dylib
 src/main/resources/**/*.dll
diff --git a/CLAUDE.md b/CLAUDE.md
index 1c2094d8..03743d91 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -163,49 +163,94 @@ At runtime the device must provide its own OpenCL ICD (`libOpenCL.so`);
 Qualcomm Adreno drivers do. Devices without an ICD should use the default
 CPU-only Android JAR.
 
-## Windows Ninja artifact (sccache-cached, parallel to the MSVC build)
-
-The Visual Studio generator ignores `CMAKE_{C,CXX}_COMPILER_LAUNCHER`, so the two MSVC Windows
-jobs (`build-windows-x86_64`, `build-windows-x86`) **cannot** use the sccache/Depot cache. Rather
-than switch the trusted MSVC build, the repo builds the **same CPU natives a second time** with the
-**`Ninja Multi-Config`** generator (which *does* honor the launcher) and ships them as a separate
-**`ninja-windows`** Maven classifier JAR. **The MSVC build is the default JAR and is kept
-permanently** — the Ninja artifact is an additional, cache-accelerated, independently
-end-to-end-tested option, not a replacement. (Upstream llama.cpp ships its `windows-cuda` artifact
-with Ninja Multi-Config + MSVC, proving the combination works on the same tree.)
-
-Unlike the CUDA / OpenCL classifiers — which differ by a **GGML backend flag** and route their
-output in `CMakeLists.txt` — the Ninja Windows build differs only by **generator/toolchain**, so
-there is **no `CMakeLists.txt` change**: both generators emit to the canonical
-`src/main/resources/.../Windows/{x86_64,x86}/`. Routing to the classifier tree happens purely at the
-CI-download + pom-profile level. Four places wire it together:
-
-1. **`.github/build.bat`** — sccache probe guard mirroring `build.sh`'s `sccache_can_wrap_compiler()`:
-   when `USE_CACHE=true` and `sccache` is on PATH, it compiles a trivial TU through `sccache cl.exe`;
-   only on success does it pass `-DCMAKE_{C,CXX}_COMPILER_LAUNCHER=sccache` and print
-   `sccache --show-stats`. A missing/crashing sccache falls back to a green uncached build. The MSVC
-   jobs do not set `USE_CACHE`, so the guard is inert for them.
-2. **`.github/workflows/publish.yml`** — build jobs `build-windows-x86_64-ninja` /
-   `build-windows-x86-ninja` (`windows-2025-vs2026`, `ilammy/msvc-dev-cmd@v1` for the arch env,
-   sccache v0.16.0 from the GitHub release **zip** + Depot WebDAV, `build.bat -G "Ninja Multi-Config"`),
-   uploading artifacts `Windows-{x86_64,x86}-ninja` (**not** `*-libraries`, so the `package` job's
-   `pattern: "*-libraries"` ignores them). `test-java-windows-x86_64-ninja` loads the Ninja DLL via
-   JNI and runs the full model-backed suite. The `package`, `publish-snapshot`, and `publish-release`
-   jobs download `Windows-*-ninja` into `src/main/resources_windows_ninja/` and activate the
-   `windows-ninja` Maven profile.
-3. **`pom.xml`** — the `windows-ninja` profile produces a second JAR with `<classifier>ninja-windows</classifier>`
-   from the `${project.build.outputDirectory}_windows_ninja` tree (separate compile pass + resource
-   copy + classified jar; mirrors the `cuda` / `opencl-android` profiles). Activated only in CI.
-4. **`README.md`** — the `ninja-windows` row + dependency snippet in "Choosing the right classifier".
-
-`src/main/resources_windows_ninja/` is git-ignored (staged by CI, never committed — same policy as
-the native libs and the CUDA/OpenCL trees).
-
-**Local sanity build** (needs MSVC + a Ninja on PATH; sccache optional):
+## Windows native classifiers (default Ninja CPU + MSVC classifier + CUDA/Vulkan/OpenCL GPU)
+
+The Windows native libraries ship in **five** forms. The **default JAR's** Windows natives are now
+built with the **`Ninja Multi-Config`** generator (the *default flip*); the Visual Studio / MSVC
+build is shipped as the **`msvc-windows`** classifier; and three GPU backends ship as
+**`cuda13-windows-x86-64`**, **`vulkan-windows-x86-64`**, and **`opencl-windows-x86-64`** (all
+**x86_64 only**, all Ninja).
+
+**Why Ninja is the default (the flip).** The Visual Studio generator ignores
+`CMAKE_{C,CXX}_COMPILER_LAUNCHER`, so only Ninja Multi-Config can front `cl.exe` with sccache over
+Depot WebDAV. **Both generators use the same MSVC toolchain** (`cl.exe`, static `/MT` CRT via
+`CMAKE_MSVC_RUNTIME_LIBRARY`, same Release flags, same runner), so the produced
+`jllama.dll`/`llama.dll`/`ggml.dll` are **functionally equivalent with identical runtime
+dependencies** — the only difference is build-system plumbing + caching. Making Ninja the default
+gives the most-pulled JAR the sccache cache; MSVC stays available as a classifier for anyone who
+wants the Visual-Studio-generator build. (Upstream llama.cpp also builds its Windows artifacts with
+Ninja Multi-Config + MSVC.) Both Windows CPU builds are validated end-to-end with the full
+model-backed Java suite (`test-java-windows-x86_64` = default/Ninja, `test-java-windows-x86_64-msvc`
+= MSVC classifier).
+
+**GPU runtime libraries are NOT bundled.** The GPU JARs ship only `jllama.dll`/`llama.dll`/`ggml.dll`
+(plus the embedded backend). The consumer's driver/toolkit must supply the runtime: CUDA needs the
+installed CUDA 13 Toolkit (`cudart64_13.dll`/`cublas64_13.dll`/`cublasLt64_13.dll` on `PATH`); Vulkan
+needs `vulkan-1.dll` (ships with current GPU drivers); OpenCL needs the vendor ICD
+(`System32\OpenCL.dll`). Not bundling = no NVIDIA-EULA redistribution obligation. **GitHub-hosted
+Windows runners have NO GPU**, so the GPU jobs **build the artifact only** (no `-DBUILD_TESTING`/`ctest`)
+— a GPU-linked `jllama_test.exe` can't even be enumerated on a GPU-less runner (it errors probing for a
+device, so `gtest_discover_tests` registers a failing `*_NOT_BUILT` sentinel). The CPU-only C++ unit
+suite is fully covered by the `C++ Tests` job + the CPU Windows jobs; model-backed GPU inference is
+local / self-hosted.
+
+Wiring (mirrors the CUDA-Linux / OpenCL-Android classifier pattern):
+
+1. **`CMakeLists.txt`** — the `if(GGML_CUDA) … elseif(GGML_VULKAN) … elseif(GGML_OPENCL) … else()`
+   chain is **OS-aware**: CUDA → `resources_windows_cuda` on Windows (else `resources_linux_cuda`),
+   Vulkan → `resources_windows_vulkan`, OpenCL → `resources_windows_opencl` on Windows (else
+   `resources_android_opencl`). The default CPU build (both generators) still emits to the canonical
+   `src/main/resources/.../Windows/{x86_64,x86}/`, so the Ninja-vs-MSVC split is purely a
+   CI-artifact-name + pom-profile concern (no CMake change for it).
+2. **`.github/build.bat`** — the sccache probe guard (mirrors `build.sh`) wraps the **cl.exe** C/C++ TUs
+   only. Unlike `build.sh` (Linux), it does **not** wrap `nvcc`: sccache on Windows can't parse the nvcc
+   command line (`sccache: error: Could not parse shell line`) and fails every `.cu` compile, so CUDA
+   device code builds with nvcc directly (uncached). `build.bat` also propagates a `cmake --build`
+   failure as a non-zero exit (a prior bug let a failed CUDA build exit 0 → empty artifact → late
+   `package` failure); the GPU upload steps additionally use `if-no-files-found: error` as a backstop.
+3. **`.github/build_opencl_windows.bat`** — stages Khronos OpenCL-Headers + builds OpenCL-ICD-Loader
+   (`OpenCL.lib`), then delegates to `build.bat` with `-DOpenCL_INCLUDE_DIR`/`-DOpenCL_LIBRARY`
+   (the Windows analogue of `build_opencl_android.sh`).
+4. **`.github/workflows/publish.yml`** — build jobs (all `windows-2025-vs2026`, `ilammy/msvc-dev-cmd@v1`,
+   sccache v0.16.0 zip + Depot WebDAV):
+   - `build-windows-x86_64` / `build-windows-x86` — **Ninja CPU**, artifacts `Windows-{arch}-libraries`
+     → picked up by the `package` job's `pattern: "*-libraries"` into the **default** tree.
+   - `build-windows-x86_64-msvc` / `build-windows-x86-msvc` — **MSVC CPU**, artifacts `Windows-{arch}-msvc`.
+   - `build-windows-x86_64-cuda` — `Jimver/cuda-toolkit@v0.2.35` (CUDA `13.2.0`) + `-DGGML_CUDA=ON`,
+     artifact `Windows-x86_64-cuda`.
+   - `build-windows-x86_64-vulkan` — `jakoch/install-vulkan-sdk-action` + `-DGGML_VULKAN=ON`, artifact
+     `Windows-x86_64-vulkan`.
+   - `build-windows-x86_64-opencl` — `build_opencl_windows.bat -DGGML_OPENCL=ON -DGGML_OPENCL_EMBED_KERNELS=ON`,
+     artifact `Windows-x86_64-opencl`.
+   The `package`, `publish-snapshot`, and `publish-release` jobs download each non-default artifact into
+   its `src/main/resources_windows_{msvc,cuda,vulkan,opencl}/` tree and activate the
+   `windows-msvc,cuda-windows,vulkan-windows,opencl-windows` Maven profiles.
+5. **`pom.xml`** — profiles `windows-msvc` / `cuda-windows` / `vulkan-windows` / `opencl-windows`,
+   each a separate compile pass + resource copy + classified jar (classifiers `msvc-windows` /
+   `cuda13-windows-x86-64` / `vulkan-windows-x86-64` / `opencl-windows-x86-64`). Activated only in CI.
+6. **`README.md`** — the classifier table + dependency snippets in "Choosing the right classifier".
+
+`src/main/resources_windows_{msvc,cuda,vulkan,opencl}/` are git-ignored (staged by CI, never committed).
+
+**First CI run (PR #276, run 28327740376):** the default Ninja CPU flip, the MSVC classifier, and the
+**OpenCL** job were green on the first try. Two GPU jobs needed a toolchain fix: **CUDA** failed with
+`Version not available: 13.0.0` because the pinned `Jimver/cuda-toolkit@v0.2.24` predated CUDA 13.x →
+bumped to `@v0.2.35` + `13.2.0` (matches the Linux pin, classifier stays `cuda13-…`); **Vulkan** failed
+`find_package(Vulkan)` because `humbletim/install-vulkan-sdk` set `VULKAN_SDK` but laid the SDK out in a
+way CMake's `FindVulkan` couldn't read → switched to `jakoch/install-vulkan-sdk-action` (purpose-built,
+FindVulkan-compatible). Because all seven Windows build jobs (four CPU + three GPU) are in the `package`/publish `needs:` graph, a
+GPU-toolchain failure blocks packaging — the same release-gating policy the Linux-CUDA / Android-OpenCL
+jobs already follow.
+
+**Local sanity builds** (need MSVC + Ninja on PATH; sccache optional; GPU builds also need the matching SDK):
 ```bat
 mvn -q compile
 .github\build.bat -G "Ninja Multi-Config" -DOS_NAME=Windows -DOS_ARCH=x86_64 -DBUILD_TESTING=ON
 ctest --test-dir build --output-on-failure
+:: GPU (needs the matching SDK installed + on PATH):
+.github\build.bat -G "Ninja Multi-Config" -DGGML_CUDA=ON   -DOS_NAME=Windows -DOS_ARCH=x86_64
+.github\build.bat -G "Ninja Multi-Config" -DGGML_VULKAN=ON -DOS_NAME=Windows -DOS_ARCH=x86_64
+.github\build_opencl_windows.bat -G "Ninja Multi-Config" -DGGML_OPENCL=ON -DGGML_OPENCL_EMBED_KERNELS=ON -DOS_NAME=Windows -DOS_ARCH=x86_64
 ```
 
 ## WebUI (llama.cpp Svelte UI) embedding
@@ -348,9 +393,10 @@ dockcross wrapper only forwards host env it is explicitly told to via `-e`. The
 version is the `SCCACHE_DL_VERSION` knob in `build.sh` (default **0.16.0**; overridable per-job
 to try a different build against a container that crashed another). **Windows** is handled
 separately (the Visual Studio generator ignores `CMAKE_*_COMPILER_LAUNCHER`): see
-"Windows Ninja artifact" below — the cached path uses the **Ninja Multi-Config** generator with a
-`build.bat` sccache probe and a direct sccache zip download (not `mozilla-actions/sccache-action`),
-shipped as a parallel `ninja-windows` classifier JAR while the MSVC default stays the trusted build.
+"Windows native classifiers" above — the **default** Windows CPU JAR now uses the **Ninja
+Multi-Config** generator (so it caches) with a `build.bat` sccache probe and a direct sccache zip
+download (not `mozilla-actions/sccache-action`); the uncached MSVC build ships as the `msvc-windows`
+classifier, and the three Windows GPU classifiers (CUDA/Vulkan/OpenCL) use the same Ninja path.
 
 **Cross-repo scope.** This Depot/sccache compiler cache makes sense only for java-llama.cpp —
 it is the only sibling repo with a native (C++/JNI) build. It does not apply to the pure-Maven
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8e3e7846..ebba2833 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -243,12 +243,33 @@ if(NOT OS_ARCH)
     message(FATAL_ERROR "Could not determine CPU architecture")
 endif()
 
+# Backend + OS routing for the native-library output tree. Each GPU backend ships
+# under its own Maven classifier, so it must land in a backend-specific resource
+# root (the default CPU tree stays src/main/resources/). The GPU branches are
+# OS-aware because the same GGML flag is used on more than one platform:
+#   - GGML_CUDA   -> Linux (resources_linux_cuda)   AND Windows (resources_windows_cuda)
+#   - GGML_OPENCL -> Android (resources_android_opencl) AND Windows (resources_windows_opencl)
+#   - GGML_VULKAN -> Windows only (resources_windows_vulkan)
+# The classifier->tree mapping is mirrored by the matching Maven profile in pom.xml.
 if(GGML_CUDA)
-    set(JLLAMA_DIR ${CMAKE_SOURCE_DIR}/src/main/resources_linux_cuda/net/ladenthin/llama/${OS_NAME}/${OS_ARCH})
-    message(STATUS "GPU (CUDA Linux) build - Installing files to ${JLLAMA_DIR}")
+    if(OS_NAME STREQUAL "Windows")
+        set(JLLAMA_DIR ${CMAKE_SOURCE_DIR}/src/main/resources_windows_cuda/net/ladenthin/llama/${OS_NAME}/${OS_ARCH})
+        message(STATUS "GPU (CUDA Windows) build - Installing files to ${JLLAMA_DIR}")
+    else()
+        set(JLLAMA_DIR ${CMAKE_SOURCE_DIR}/src/main/resources_linux_cuda/net/ladenthin/llama/${OS_NAME}/${OS_ARCH})
+        message(STATUS "GPU (CUDA Linux) build - Installing files to ${JLLAMA_DIR}")
+    endif()
+elseif(GGML_VULKAN AND OS_NAME STREQUAL "Windows")  # Vulkan classifier tree is Windows-only; a Vulkan build on any other OS falls through to the default tree as before
+    set(JLLAMA_DIR ${CMAKE_SOURCE_DIR}/src/main/resources_windows_vulkan/net/ladenthin/llama/${OS_NAME}/${OS_ARCH})
+    message(STATUS "GPU (Vulkan) build - Installing files to ${JLLAMA_DIR}")
 elseif(GGML_OPENCL)
-    set(JLLAMA_DIR ${CMAKE_SOURCE_DIR}/src/main/resources_android_opencl/net/ladenthin/llama/${OS_NAME}/${OS_ARCH})
-    message(STATUS "GPU (OpenCL Android) build - Installing files to ${JLLAMA_DIR}")
+    if(OS_NAME STREQUAL "Windows")
+        set(JLLAMA_DIR ${CMAKE_SOURCE_DIR}/src/main/resources_windows_opencl/net/ladenthin/llama/${OS_NAME}/${OS_ARCH})
+        message(STATUS "GPU (OpenCL Windows) build - Installing files to ${JLLAMA_DIR}")
+    else()
+        set(JLLAMA_DIR ${CMAKE_SOURCE_DIR}/src/main/resources_android_opencl/net/ladenthin/llama/${OS_NAME}/${OS_ARCH})
+        message(STATUS "GPU (OpenCL Android) build - Installing files to ${JLLAMA_DIR}")
+    endif()
 else()
     set(JLLAMA_DIR ${CMAKE_SOURCE_DIR}/src/main/resources/net/ladenthin/llama/${OS_NAME}/${OS_ARCH})
     message(STATUS "CPU build - Installing files to ${JLLAMA_DIR}")
diff --git a/README.md b/README.md
index 86b51236..a46d5f0e 100644
--- a/README.md
+++ b/README.md
@@ -162,17 +162,22 @@ If any of these match your platform, you can include the Maven dependency and ge
 
 ### Choosing the right classifier
 
-The Maven coordinate `net.ladenthin:llama` publishes one default JAR (CPU-only)
-plus optional JARs selected via a Maven `<classifier>`: two GPU/accelerator
-builds and one alternate-toolchain Windows build. Pick at most one GPU/accelerator
-classifier — those are mutually exclusive — and optionally the Windows build.
+The Maven coordinate `net.ladenthin:llama` publishes one default JAR (CPU-only;
+its Windows natives are built with the Ninja Multi-Config + MSVC toolchain) plus
+optional JARs selected via a Maven `<classifier>`: three Windows GPU builds
+(CUDA / Vulkan / OpenCL), the Linux CUDA and Android OpenCL builds, and an
+alternate-toolchain MSVC Windows CPU build. Pick at most one GPU/accelerator
+classifier — those are mutually exclusive — and optionally a CPU Windows build.
 
 | Classifier | Backend | Target platform | Runtime requirement |
 |---|---|---|---|
-| _(none)_ | CPU | Linux x86-64 / aarch64, macOS x86-64 / aarch64, Windows x86-64 (MSVC / Visual Studio generator), Android aarch64 (CPU) | A JDK 8+ JVM. **Linux `aarch64` additionally requires glibc ≥ 2.39** (e.g. Ubuntu 24.04+, Debian 13+) — it is built natively on `ubuntu-24.04-arm`, matching upstream llama.cpp's own ARM binaries; older-glibc ARM hosts (Ubuntu 22.04, Debian 12, RHEL 8/9, Amazon Linux 2023) are not supported. Linux x86-64 keeps a glibc 2.17 floor (manylinux2014). |
+| _(none)_ | CPU | Linux x86-64 / aarch64, macOS x86-64 / aarch64, Windows x86-64 / x86 (Ninja Multi-Config + MSVC), Android aarch64 (CPU) | A JDK 8+ JVM. **Linux `aarch64` additionally requires glibc ≥ 2.39** (e.g. Ubuntu 24.04+, Debian 13+) — it is built natively on `ubuntu-24.04-arm`, matching upstream llama.cpp's own ARM binaries; older-glibc ARM hosts (Ubuntu 22.04, Debian 12, RHEL 8/9, Amazon Linux 2023) are not supported. Linux x86-64 keeps a glibc 2.17 floor (manylinux2014). |
+| `msvc-windows` | CPU (MSVC / Visual Studio generator) | Windows x86-64 and x86 | None beyond a JDK 8+ JVM. Same CPU backend as the default JAR's Windows natives, but compiled with the Visual Studio generator instead of `Ninja Multi-Config`. Both use the same MSVC toolchain (static `/MT` CRT), so they are functionally equivalent — provided as an alternate-toolchain option. |
+| `cuda13-windows-x86-64` | CUDA 13 | Windows x86-64 with NVIDIA GPU | NVIDIA driver + CUDA 13 Toolkit installed on the host (`cudart64_13.dll`, `cublas64_13.dll`, `cublasLt64_13.dll` resolvable on `PATH`). The runtime libraries are **not bundled** in the JAR; native-library load fails with `UnsatisfiedLinkError` if they are absent. No CPU fallback. |
+| `vulkan-windows-x86-64` | Vulkan | Windows x86-64 with a Vulkan 1.2+ GPU (NVIDIA / AMD / Intel) | A Vulkan runtime (`vulkan-1.dll`), which current GPU drivers install. No Vulkan SDK is needed at runtime. The most portable Windows GPU option (vendor-independent). |
+| `opencl-windows-x86-64` | OpenCL | Windows x86-64 with an OpenCL 2.0+ GPU | A vendor OpenCL ICD (`OpenCL.dll`, installed by the GPU driver). **Note:** the GGML OpenCL backend is Adreno-tuned; on desktop GPUs CUDA or Vulkan are better supported. |
 | `cuda13-linux-x86-64` | CUDA 13 | Linux x86-64 with NVIDIA GPU | NVIDIA driver + CUDA 13 runtime libraries (`libcudart.so.13`, `libcublas.so.13`) installed on the host. The shared library is dynamically linked against them and will fail to `dlopen` if they are absent — there is no automatic fallback to CPU. |
 | `opencl-android-aarch64` | OpenCL (Adreno) | Android aarch64 with Qualcomm Adreno GPU | A device-supplied OpenCL ICD (`libOpenCL.so`). Devices without an ICD (e.g. most non-Snapdragon Android hardware) must use the default CPU JAR. |
-| `ninja-windows` | CPU (Ninja Multi-Config + MSVC) | Windows x86-64 and x86 | None beyond a JDK 8+ JVM. Same CPU backend as the default JAR's Windows natives, but compiled with the `Ninja Multi-Config` generator (sccache-cached in CI) instead of the Visual Studio generator. Provided so both Windows builds are available; functionally equivalent for normal use. |
 
 ```xml
 <!-- CPU (default) -->
@@ -198,23 +203,52 @@ classifier — those are mutually exclusive — and optionally the Windows build
     <classifier>opencl-android-aarch64</classifier>
 </dependency>
 
-<!-- Windows natives built with the Ninja Multi-Config generator (CPU) -->
+<!-- CUDA on Windows x86-64 (requires CUDA 13 Toolkit on the host) -->
 <dependency>
     <groupId>net.ladenthin</groupId>
     <artifactId>llama</artifactId>
     <version>5.0.2</version>
-    <classifier>ninja-windows</classifier>
+    <classifier>cuda13-windows-x86-64</classifier>
+</dependency>
+
+<!-- Vulkan on Windows x86-64 (NVIDIA/AMD/Intel; vulkan-1.dll from the driver) -->
+<dependency>
+    <groupId>net.ladenthin</groupId>
+    <artifactId>llama</artifactId>
+    <version>5.0.2</version>
+    <classifier>vulkan-windows-x86-64</classifier>
+</dependency>
+
+<!-- OpenCL on Windows x86-64 (requires a driver-provided OpenCL ICD) -->
+<dependency>
+    <groupId>net.ladenthin</groupId>
+    <artifactId>llama</artifactId>
+    <version>5.0.2</version>
+    <classifier>opencl-windows-x86-64</classifier>
+</dependency>
+
+<!-- Windows CPU natives built with the MSVC / Visual Studio generator -->
+<dependency>
+    <groupId>net.ladenthin</groupId>
+    <artifactId>llama</artifactId>
+    <version>5.0.2</version>
+    <classifier>msvc-windows</classifier>
 </dependency>
 ```
 
 > [!IMPORTANT]
-> The CUDA JAR is **CUDA-only at runtime**. On a CPU-only host (no NVIDIA
-> driver or no CUDA 13 runtime libraries installed) the JVM will fail at
-> native-library load time with `UnsatisfiedLinkError` caused by an
-> underlying `dlopen` failure on `libcudart.so.13`. If you want to ship a
-> single artifact that works on both CPU and CUDA hosts, depend on the
-> default (CPU) JAR; users who want GPU acceleration must compile locally
-> with `-DGGML_CUDA=ON` (see [Setup required](#setup-required)).
+> The GPU JARs are **GPU-only at runtime**. On a host without the matching
+> GPU driver/runtime the JVM fails at native-library load time with
+> `UnsatisfiedLinkError`: the CUDA JARs are dynamically linked against the
+> CUDA runtime (`libcudart.so.13` on Linux, `cudart64_13.dll` /
+> `cublas64_13.dll` / `cublasLt64_13.dll` on Windows — the Windows CUDA
+> runtime is **not bundled**, install the CUDA 13 Toolkit), the Vulkan JAR
+> needs a Vulkan runtime (`vulkan-1.dll`, shipped with current GPU drivers),
+> and the OpenCL JARs need a vendor OpenCL ICD. There is no automatic
+> fallback to CPU. If you want a single artifact that works on both CPU and
+> GPU hosts, depend on the default (CPU) JAR; users who want GPU acceleration
+> on an unlisted platform must compile locally with the matching `-DGGML_*=ON`
+> flag (see [Setup required](#setup-required)).
 
 > [!NOTE]
 > Android `armeabi-v7a` (32-bit ARM) is **not** published. Only 64-bit
diff --git a/TODO.md b/TODO.md
index d28f1d1b..66de274f 100644
--- a/TODO.md
+++ b/TODO.md
@@ -175,46 +175,51 @@ primary goal: agentic tool-calling with Qwen):
   JDK-based `OpenAiCompatServer` (which is complete and the primary surface); value is shipping the full
   llama.cpp server + WebUI in-process without a separate `llama-server` binary. JNI + C++ work.
 
-### Windows compiler cache (sccache) — dual build shipped (MSVC default + Ninja classifier)
-
-**Design decision (do not revisit without the owner): the MSVC / Visual Studio build is the
-default JAR and is kept permanently — never retired.** The Ninja Multi-Config build is shipped
-*alongside* it as the `ninja-windows` classifier JAR, never as a replacement. The loss of the
-sccache cache on the MSVC build is accepted; the Ninja build exists so a cache-accelerated,
-independently validated second Windows artifact is available for users to compare/adopt.
-
-**Why two builds.** The cache mechanism is the CMake *compiler launcher*
-(`-DCMAKE_C_COMPILER_LAUNCHER=sccache`). **The Visual Studio generator ignores it entirely**
-(only Ninja/Makefile generators honor it), so the MSVC jobs can never cache. The Ninja
-Multi-Config generator *does* honor it (upstream llama.cpp `b9739` ships `windows-cuda` this way,
-proving Ninja Multi-Config + MSVC works on the same tree). The two builds produce **different
-`jllama.dll`s**, so they cannot coexist at the same resource path in one JAR — hence the classifier.
-
-**What shipped (this branch):**
-- **4 Windows build jobs, all permanent:** `build-windows-x86_64`, `build-windows-x86` (MSVC,
-  default JAR) and `build-windows-x86_64-ninja`, `build-windows-x86-ninja` (Ninja + sccache/Depot).
-- **Both tested end-to-end:** all four run the C++ unit tests (`ctest`); `test-java-windows-x86_64`
-  (MSVC) and the new `test-java-windows-x86_64-ninja` (Ninja) both load the DLL via JNI and run the
-  full model-backed Java suite.
-- **`.github/build.bat`** — sccache probe guard (mirrors `build.sh`'s `sccache_can_wrap_compiler()`):
-  `USE_CACHE=true` + `sccache` on PATH + a trivial TU compiling through `sccache cl.exe` ⇒
-  `-DCMAKE_{C,CXX}_COMPILER_LAUNCHER=sccache` + `sccache --show-stats`; else green uncached. Inert
-  for the MSVC jobs (they don't set `USE_CACHE`).
-- **`pom.xml`** — `windows-ninja` profile → `<classifier>ninja-windows</classifier>` JAR from
-  `${project.build.outputDirectory}_windows_ninja` (mirrors the `cuda` / `opencl-android` profiles).
-- **`publish.yml`** — the `package`, `publish-snapshot`, `publish-release` jobs download
-  `Windows-{x86_64,x86}-ninja` into `src/main/resources_windows_ninja/` and activate the
-  `windows-ninja` profile; the Ninja build + Java-test jobs are in the `package` `needs:` graph.
-- Docs: `README.md` classifier table + `CLAUDE.md` "Windows Ninja artifact" section.
-
-**Verification — DONE (PR #248).** The Ninja jobs are green and cache-warm: `Build and Test
-Windows … (Ninja … sccache, eval)` builds + `ctest` pass, and `Java Tests Windows 2025 x86_64
-(Ninja, eval)` loads the DLL via JNI and runs the full model-backed suite green (after the b9739
-arg-parse patch landed). `sccache --show-stats` confirms cache hits on the Ninja jobs.
-
-**Optional follow-up:** smoke-test that the *published* `ninja-windows` classifier JAR loads its DLL
-on a clean Windows host. Publishing is gated behind `publish_to_central`, so a broken Windows job
-blocks the release before any artifact reaches Central/GitHub Releases.
+### Windows native classifiers — default flip (Ninja default + MSVC classifier) + CUDA/Vulkan/OpenCL GPU
+
+**Design decision UPDATED by the owner (supersedes the earlier "MSVC is the permanent default"
+note): the default Windows CPU JAR is now the Ninja Multi-Config build, and the MSVC / Visual
+Studio build ships as the `msvc-windows` classifier.** Rationale: both generators use the same MSVC
+toolchain (`cl.exe`, static `/MT` CRT) on the same runner, so the produced DLLs are functionally
+equivalent with identical runtime dependencies — the only difference is build-system plumbing +
+sccache caching. Making Ninja the default gives the most-pulled JAR the cache; MSVC stays available
+as a classifier. Three Windows GPU classifiers were added at the same time (x86_64 only, all Ninja):
+`cuda13-windows-x86-64`, `vulkan-windows-x86-64`, `opencl-windows-x86-64`.
+
+**Why the cache needs Ninja.** The cache mechanism is the CMake *compiler launcher*
+(`-DCMAKE_C_COMPILER_LAUNCHER=sccache`); the Visual Studio generator ignores it entirely, only
+Ninja/Makefile generators honor it. Upstream llama.cpp also builds its Windows artifacts with Ninja
+Multi-Config + MSVC.
+
+**What shipped (this branch — first CI run done, re-run pending; see Verification below):**
+- **CPU build jobs:** `build-windows-x86_64` / `build-windows-x86` are now **Ninja** (default,
+  artifacts `Windows-{arch}-libraries`); `build-windows-x86_64-msvc` / `build-windows-x86-msvc` are
+  **MSVC** (artifacts `Windows-{arch}-msvc`). `test-java-windows-x86_64` (default/Ninja) and
+  `test-java-windows-x86_64-msvc` both load the DLL via JNI and run the full model-backed suite.
+- **GPU build jobs (x86_64, Ninja, build the artifact only — runners have no GPU, and a
+  GPU-linked jllama_test can't be enumerated there; C++ suite runs on the CPU jobs):**
+  `build-windows-x86_64-cuda` (`Jimver/cuda-toolkit@v0.2.35` CUDA `13.2.0` + `-DGGML_CUDA=ON`),
+  `build-windows-x86_64-vulkan` (`jakoch/install-vulkan-sdk-action` + `-DGGML_VULKAN=ON`),
+  `build-windows-x86_64-opencl` (`build_opencl_windows.bat` stages the ICD loader + `-DGGML_OPENCL=ON`).
+- **`CMakeLists.txt`** — OS-aware backend routing (CUDA/OpenCL → Windows trees, new Vulkan branch).
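+- **`.github/build.bat`** — sccache wraps the `cl.exe` C/C++ TUs only (nvcc is **not** wrapped on Windows: sccache cannot parse the nvcc command line, so `.cu` files build uncached); also propagates a `cmake --build` failure as a non-zero exit.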
+- **`.github/build_opencl_windows.bat`** — new, Windows analogue of `build_opencl_android.sh`.
+- **`pom.xml`** — profiles `windows-msvc` / `cuda-windows` / `vulkan-windows` / `opencl-windows`
+  (classifiers `msvc-windows` / `cuda13-windows-x86-64` / `vulkan-windows-x86-64` / `opencl-windows-x86-64`).
+- **`publish.yml`** — the `package` / `publish-snapshot` / `publish-release` jobs download each
+  non-default artifact into `src/main/resources_windows_{msvc,cuda,vulkan,opencl}/` and activate the
+  four profiles; all seven Windows build jobs (four CPU + three GPU) are in the `package` `needs:` graph.
+- Docs: `README.md` classifier table + `CLAUDE.md` "Windows native classifiers" section.
+
+**Verification — first CI run done (PR #276, run 28327740376).** Green on the first try: default Ninja
+CPU flip (x64+x86), MSVC classifier (x64+x86), and the **OpenCL** GPU job (`build_opencl_windows.bat`
+ICD staging works). Two GPU jobs were fixed after the first run: **CUDA** (`Version not available:
+13.0.0` → bumped `Jimver/cuda-toolkit` `v0.2.24`→`v0.2.35` + `13.2.0`) and **Vulkan**
+(`find_package(Vulkan)` couldn't read the `humbletim` SDK layout → switched to
+`jakoch/install-vulkan-sdk-action`). Re-run pending to confirm both fixes.
+
+**Optional follow-up:** smoke-test that each *published* classifier JAR loads its DLL on a clean
+Windows host with the matching GPU driver/toolkit installed.
 
 **Reference notes:**
 - Cache backend is **sccache + Depot WebDAV** (consistent with the other 8 jobs — one token, shared
diff --git a/pom.xml b/pom.xml
index a04265f0..6c451877 100644
--- a/pom.xml
+++ b/pom.xml
@@ -913,24 +913,25 @@ SPDX-License-Identifier: MIT
 			</build>
 		</profile>
 		<profile>
-			<!-- Windows Ninja Multi-Config + sccache build, shipped as a separate
-			     classifier JAR alongside the default (MSVC/Visual-Studio-generator)
-			     Windows natives. The default JAR keeps the trusted MSVC jllama.dll;
-			     consumers who want the Ninja-built natives add the
-			     `ninja-windows` classifier. Mirrors the cuda / opencl-android
-			     classifier pattern: a separate compile pass + resource copy +
-			     classified jar. The Ninja DLLs are staged into
-			     src/main/resources_windows_ninja/ by CI before this profile runs. -->
-			<id>windows-ninja</id>
+			<!-- MSVC (Visual Studio generator) Windows CPU natives, shipped as the
+			     `msvc-windows` classifier JAR. After the default flip the DEFAULT JAR
+			     ships the Ninja Multi-Config natives (sccache-cached, merged into
+			     src/main/resources/ by CI); this profile packages the alternative
+			     MSVC-generator build for anyone who wants it. Both use the same MSVC
+			     toolchain (static /MT CRT) so they are functionally equivalent. Mirrors
+			     the cuda / opencl-android classifier pattern: a separate compile pass +
+			     resource copy + classified jar. The MSVC DLLs are staged into
+			     src/main/resources_windows_msvc/ by CI before this profile runs. -->
+			<id>windows-msvc</id>
 			<build>
 				<plugins>
 					<plugin>
 						<groupId>org.apache.maven.plugins</groupId>
 						<artifactId>maven-compiler-plugin</artifactId>
 						<executions>
-						<!-- Separate build pass for the windows-ninja classifier -->
+						<!-- Separate build pass for the windows-msvc classifier -->
 						<execution>
-							<id>windows-ninja</id>
+							<id>windows-msvc</id>
 							<phase>compile</phase>
 							<goals>
 								<goal>compile</goal>
@@ -946,7 +947,7 @@ SPDX-License-Identifier: MIT
 									<arg>src/main/cpp</arg>
 								</compilerArgs>
 								<outputDirectory>
-									${project.build.outputDirectory}_windows_ninja</outputDirectory>
+									${project.build.outputDirectory}_windows_msvc</outputDirectory>
 							</configuration>
 						</execution>
 						</executions>
@@ -956,18 +957,18 @@ SPDX-License-Identifier: MIT
 						<executions>
-						<!-- Copy the Ninja-built Windows libs to the output directory -->
+						<!-- Copy the MSVC-built Windows libs to the output directory -->
 						<execution>
-							<id>copy-resources-windows-ninja</id>
+							<id>copy-resources-windows-msvc</id>
 							<phase>process-classes</phase>
 							<goals>
 								<goal>copy-resources</goal>
 							</goals>
 							<configuration>
 								<outputDirectory>
-									${project.build.outputDirectory}_windows_ninja</outputDirectory>
+									${project.build.outputDirectory}_windows_msvc</outputDirectory>
 								<resources>
 									<resource>
 										<directory>
-											${basedir}/src/main/resources_windows_ninja/</directory>
+											${basedir}/src/main/resources_windows_msvc/</directory>
 										<includes>
 											<include>**/*.*</include>
 										</includes>
@@ -984,15 +985,258 @@ SPDX-License-Identifier: MIT
 							<!-- Pick class files AND libs from custom output
 							directory -->
 							<execution>
-								<id>windows-ninja</id>
+								<id>windows-msvc</id>
 								<phase>package</phase>
 								<goals>
 									<goal>jar</goal>
 								</goals>
 								<configuration>
-									<classifier>ninja-windows</classifier>
+									<classifier>msvc-windows</classifier>
 									<classesDirectory>
-										${project.build.outputDirectory}_windows_ninja</classesDirectory>
+										${project.build.outputDirectory}_windows_msvc</classesDirectory>
+								</configuration>
+							</execution>
+						</executions>
+					</plugin>
+				</plugins>
+			</build>
+		</profile>
+
+		<profile>
+			<!-- Windows CUDA GPU natives (x86_64 only), shipped as the
+			     `cuda13-windows-x86-64` classifier JAR. Built with -DGGML_CUDA=ON and
+			     routed by CMakeLists.txt to src/main/resources_windows_cuda/. The CUDA
+			     runtime libs (cudart64_13 / cublas64_13 / cublasLt64_13) are NOT bundled;
+			     the consumer's installed CUDA Toolkit provides them. Mirrors the cuda
+			     (Linux) classifier pattern. CI stages the natives before this profile runs. -->
+			<id>cuda-windows</id>
+			<build>
+				<plugins>
+					<plugin>
+						<groupId>org.apache.maven.plugins</groupId>
+						<artifactId>maven-compiler-plugin</artifactId>
+						<executions> <!-- Separate build pass for the cuda-windows classifier -->
+						<execution>
+							<id>cuda-windows</id>
+							<phase>compile</phase>
+							<goals>
+								<goal>compile</goal>
+							</goals>
+							<configuration>
+								<excludes>
+									<exclude>module-info.java</exclude>
+								</excludes>
+								<compilerArgs>
+									<arg>-h</arg> <!-- javac -h: emit native header files into the directory named by the next arg -->
+									<arg>src/main/cpp</arg>
+								</compilerArgs>
+								<outputDirectory>
+									${project.build.outputDirectory}_windows_cuda</outputDirectory>
+							</configuration>
+						</execution>
+						</executions>
+					</plugin>
+					<plugin>
+						<artifactId>maven-resources-plugin</artifactId>
+						<executions> <!-- Copy the staged CUDA Windows libs next to the classes compiled above -->
+						<execution>
+							<id>copy-resources-cuda-windows</id>
+							<phase>process-classes</phase>
+							<goals>
+								<goal>copy-resources</goal>
+							</goals>
+							<configuration>
+								<outputDirectory>
+									${project.build.outputDirectory}_windows_cuda</outputDirectory>
+								<resources>
+									<resource>
+										<directory>
+											${basedir}/src/main/resources_windows_cuda/</directory>
+										<includes>
+											<include>**/*.*</include>
+										</includes>
+									</resource>
+								</resources>
+							</configuration>
+						</execution>
+						</executions>
+					</plugin>
+					<plugin>
+						<groupId>org.apache.maven.plugins</groupId>
+						<artifactId>maven-jar-plugin</artifactId>
+						<executions> <!-- Package class files AND libs from the custom output directory -->
+							<execution>
+								<id>cuda-windows</id>
+								<phase>package</phase>
+								<goals>
+									<goal>jar</goal>
+								</goals>
+								<configuration>
+									<classifier>cuda13-windows-x86-64</classifier>
+									<classesDirectory>
+										${project.build.outputDirectory}_windows_cuda</classesDirectory>
+								</configuration>
+							</execution>
+						</executions>
+					</plugin>
+				</plugins>
+			</build>
+		</profile>
+
+		<profile>
+			<!-- Windows Vulkan GPU natives (x86_64 only), shipped as the
+			     `vulkan-windows-x86-64` classifier JAR. Built with -DGGML_VULKAN=ON and
+			     routed by CMakeLists.txt to src/main/resources_windows_vulkan/. The Vulkan
+			     runtime (vulkan-1.dll) ships with the GPU driver; nothing is bundled.
+			     Vulkan is the most portable Windows GPU backend (NVIDIA/AMD/Intel).
+			     CI stages the natives before this profile runs. -->
+			<id>vulkan-windows</id>
+			<build>
+				<plugins>
+					<plugin>
+						<groupId>org.apache.maven.plugins</groupId>
+						<artifactId>maven-compiler-plugin</artifactId>
+						<executions> <!-- Separate build pass for the vulkan-windows classifier -->
+						<execution>
+							<id>vulkan-windows</id>
+							<phase>compile</phase>
+							<goals>
+								<goal>compile</goal>
+							</goals>
+							<configuration>
+								<excludes>
+									<exclude>module-info.java</exclude>
+								</excludes>
+								<compilerArgs>
+									<arg>-h</arg> <!-- javac -h: emit native header files into the directory named by the next arg -->
+									<arg>src/main/cpp</arg>
+								</compilerArgs>
+								<outputDirectory>
+									${project.build.outputDirectory}_windows_vulkan</outputDirectory>
+							</configuration>
+						</execution>
+						</executions>
+					</plugin>
+					<plugin>
+						<artifactId>maven-resources-plugin</artifactId>
+						<executions> <!-- Copy the staged Vulkan Windows libs next to the classes compiled above -->
+						<execution>
+							<id>copy-resources-vulkan-windows</id>
+							<phase>process-classes</phase>
+							<goals>
+								<goal>copy-resources</goal>
+							</goals>
+							<configuration>
+								<outputDirectory>
+									${project.build.outputDirectory}_windows_vulkan</outputDirectory>
+								<resources>
+									<resource>
+										<directory>
+											${basedir}/src/main/resources_windows_vulkan/</directory>
+										<includes>
+											<include>**/*.*</include>
+										</includes>
+									</resource>
+								</resources>
+							</configuration>
+						</execution>
+						</executions>
+					</plugin>
+					<plugin>
+						<groupId>org.apache.maven.plugins</groupId>
+						<artifactId>maven-jar-plugin</artifactId>
+						<executions> <!-- Package class files AND libs from the custom output directory -->
+							<execution>
+								<id>vulkan-windows</id>
+								<phase>package</phase>
+								<goals>
+									<goal>jar</goal>
+								</goals>
+								<configuration>
+									<classifier>vulkan-windows-x86-64</classifier>
+									<classesDirectory>
+										${project.build.outputDirectory}_windows_vulkan</classesDirectory>
+								</configuration>
+							</execution>
+						</executions>
+					</plugin>
+				</plugins>
+			</build>
+		</profile>
+
+		<profile>
+			<!-- Windows OpenCL GPU natives (x86_64 only), shipped as the
+			     `opencl-windows-x86-64` classifier JAR. Built with -DGGML_OPENCL=ON and
+			     routed by CMakeLists.txt to src/main/resources_windows_opencl/. The OpenCL
+			     ICD (System32\OpenCL.dll) ships with the GPU driver; nothing is bundled.
+			     NOTE: the GGML OpenCL backend is Adreno-tuned; on desktop GPUs CUDA/Vulkan
+			     are better supported. Staged by CI before this profile runs. -->
+			<id>opencl-windows</id>
+			<build>
+				<plugins>
+					<plugin>
+						<groupId>org.apache.maven.plugins</groupId>
+						<artifactId>maven-compiler-plugin</artifactId>
+						<executions>
+						<execution>
+							<id>opencl-windows</id>
+							<phase>compile</phase>
+							<goals>
+								<goal>compile</goal>
+							</goals>
+							<configuration>
+								<excludes>
+									<exclude>module-info.java</exclude>
+								</excludes>
+								<compilerArgs>
+									<arg>-h</arg>
+									<arg>src/main/cpp</arg>
+								</compilerArgs>
+								<outputDirectory>
+									${project.build.outputDirectory}_windows_opencl</outputDirectory>
+							</configuration>
+						</execution>
+						</executions>
+					</plugin>
+					<plugin>
+						<artifactId>maven-resources-plugin</artifactId>
+						<executions>
+						<execution>
+							<id>copy-resources-opencl-windows</id>
+							<phase>process-classes</phase>
+							<goals>
+								<goal>copy-resources</goal>
+							</goals>
+							<configuration>
+								<outputDirectory>
+									${project.build.outputDirectory}_windows_opencl</outputDirectory>
+								<resources>
+									<resource>
+										<directory>
+											${basedir}/src/main/resources_windows_opencl/</directory>
+										<includes>
+											<include>**/*.*</include>
+										</includes>
+									</resource>
+								</resources>
+							</configuration>
+						</execution>
+						</executions>
+					</plugin>
+					<plugin>
+						<groupId>org.apache.maven.plugins</groupId>
+						<artifactId>maven-jar-plugin</artifactId>
+						<executions>
+							<execution>
+								<id>opencl-windows</id>
+								<phase>package</phase>
+								<goals>
+									<goal>jar</goal>
+								</goals>
+								<configuration>
+									<classifier>opencl-windows-x86-64</classifier>
+									<classesDirectory>
+										${project.build.outputDirectory}_windows_opencl</classesDirectory>
 								</configuration>
 							</execution>
 						</executions>