diff --git a/.github/validate-models.bat b/.github/validate-models.bat index 76fd4e5a..17886089 100644 --- a/.github/validate-models.bat +++ b/.github/validate-models.bat @@ -9,12 +9,14 @@ REM GGUF files start with magic bytes: 0x47 0x47 0x55 0x46 ("GGUF") setlocal enabledelayedexpansion -set "MODELS=models\codellama-7b.Q2_K.gguf" "models\jina-reranker-v1-tiny-en-Q4_0.gguf" "models\AMD-Llama-135m-code.Q2_K.gguf" "models\Qwen3-0.6B-Q4_K_M.gguf" "models\Qwen2.5-1.5B-Instruct-Q4_K_M.gguf" +REM Every CI Java test job (incl. Windows) now downloads the full model set before +REM validating and runs the embedding / vision / TTS integration tests, so all of +REM these are REQUIRED (a missing one is a hard failure, not a silent self-skip). +set "MODELS=models\codellama-7b.Q2_K.gguf" "models\jina-reranker-v1-tiny-en-Q4_0.gguf" "models\AMD-Llama-135m-code.Q2_K.gguf" "models\Qwen3-0.6B-Q4_K_M.gguf" "models\Qwen2.5-1.5B-Instruct-Q4_K_M.gguf" "models\nomic-embed-text-v1.5.f16.gguf" "models\SmolVLM-500M-Instruct-Q8_0.gguf" "models\mmproj-SmolVLM-500M-Instruct-Q8_0.gguf" "models\OuteTTS-0.2-500M-Q4_K_M.gguf" "models\WavTokenizer-Large-75-F16.gguf" -REM Vision GGUFs are validated only when present (the Windows job downloads -REM them too, but the validation step must not fail when a future job opts -REM out of the vision matrix). -set "OPTIONAL_MODELS=models\SmolVLM-500M-Instruct-Q8_0.gguf" "models\mmproj-SmolVLM-500M-Instruct-Q8_0.gguf" +REM No optional models remain (the audio-input model has no CI download and its +REM test self-skips). Left empty so the optional loop below is a no-op. +set "OPTIONAL_MODELS=" echo Validating required model files... for %%M in (%MODELS%) do ( diff --git a/.github/validate-models.sh b/.github/validate-models.sh index 128d95e1..efb081b1 100755 --- a/.github/validate-models.sh +++ b/.github/validate-models.sh @@ -10,19 +10,17 @@ set -e +# Every CI Java test job (Linux + all macOS + all Windows) now downloads the full +# model set before validating, and runs the embedding / vision / TTS integration +# tests with their properties set — so all of these are REQUIRED, not optional. A +# missing model is a hard failure here (it would otherwise let an integration test +# silently self-skip). See .github/workflows/publish.yml. MODELS=( "models/codellama-7b.Q2_K.gguf" "models/jina-reranker-v1-tiny-en-Q4_0.gguf" "models/AMD-Llama-135m-code.Q2_K.gguf" "models/Qwen3-0.6B-Q4_K_M.gguf" "models/Qwen2.5-1.5B-Instruct-Q4_K_M.gguf" -) - -# Optional GGUFs validated only when present so jobs that do not download -# them (e.g. cross-compile smoke runs) still pass. The vision test image is -# committed to src/test/resources/images/test-image.jpg and is not validated -# here — its presence is asserted directly by MultimodalIntegrationTest. -OPTIONAL_MODELS=( "models/nomic-embed-text-v1.5.f16.gguf" "models/SmolVLM-500M-Instruct-Q8_0.gguf" "models/mmproj-SmolVLM-500M-Instruct-Q8_0.gguf" @@ -30,6 +28,13 @@ OPTIONAL_MODELS=( "models/WavTokenizer-Large-75-F16.gguf" ) +# Optional GGUFs validated only when present. The vision test image is committed to +# src/test/resources/images/test-image.jpg and is not validated here — its presence +# is asserted directly by MultimodalIntegrationTest. The audio-input model +# (AudioInputIntegrationTest) has no committed clip and no CI download, so that test +# self-skips and its model is intentionally not listed here. +OPTIONAL_MODELS=() + validate_gguf() { local model="$1" local required="$2" diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 0f63804e..c2d82f36 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -33,7 +33,7 @@ env: TOOL_MODEL_NAME: "Qwen2.5-1.5B-Instruct-Q4_K_M.gguf" NOMIC_EMBED_MODEL_URL: "https://huggingface.co/nomic-ai/nomic-embed-text-v1.5-GGUF/resolve/main/nomic-embed-text-v1.5.f16.gguf" NOMIC_EMBED_MODEL_NAME: "nomic-embed-text-v1.5.f16.gguf" - # Vision model + mmproj for MultimodalIntegrationTest (upstream kherud/java-llama.cpp#103 / #34). + # Vision model + mmproj for MultimodalIntegrationTest. # SmolVLM-500M is the smallest community vision GGUF that loads reliably # under the upstream mtmd pipeline. Total download ~600 MB across model # plus mmproj; matches the existing per-test-job download budget. @@ -786,10 +786,14 @@ jobs: run: test -f models/${TOOL_MODEL_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${TOOL_MODEL_URL} --create-dirs -o models/${TOOL_MODEL_NAME} - name: Download nomic embedding model (issue #98 regression) run: test -f models/${NOMIC_EMBED_MODEL_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${NOMIC_EMBED_MODEL_URL} --create-dirs -o models/${NOMIC_EMBED_MODEL_NAME} - - name: Download vision model (upstream kherud/java-llama.cpp#103 / #34) + - name: Download vision model run: test -f models/${VISION_MODEL_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${VISION_MODEL_URL} --create-dirs -o models/${VISION_MODEL_NAME} - name: Download vision mmproj run: test -f models/${VISION_MMPROJ_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${VISION_MMPROJ_URL} --create-dirs -o models/${VISION_MMPROJ_NAME} + - name: Download TTS model (OuteTTS) + run: test -f models/${TTS_MODEL_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${TTS_MODEL_URL} --create-dirs -o models/${TTS_MODEL_NAME} + - name: Download TTS vocoder (WavTokenizer) + run: test -f models/${TTS_VOCODER_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${TTS_VOCODER_URL} --create-dirs -o models/${TTS_VOCODER_NAME} - name: List files in models directory run: ls -l models/ - name: Validate model files @@ -804,10 +808,6 @@ jobs: run: | ulimit -c unlimited echo "${{ github.workspace }}/core.%e.%p" | sudo tee /proc/sys/kernel/core_pattern - - name: Download TTS model (OuteTTS) - run: test -f models/${TTS_MODEL_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${TTS_MODEL_URL} --create-dirs -o models/${TTS_MODEL_NAME} - - name: Download TTS vocoder (WavTokenizer) - run: test -f models/${TTS_VOCODER_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${TTS_VOCODER_URL} --create-dirs -o models/${TTS_VOCODER_NAME} - name: Run tests run: | mvn -e --no-transfer-progress -P jcstress test \ @@ -927,10 +927,16 @@ jobs: run: test -f models/${REASONING_MODEL_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${REASONING_MODEL_URL} --create-dirs -o models/${REASONING_MODEL_NAME} - name: Download tool-calling model run: test -f models/${TOOL_MODEL_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${TOOL_MODEL_URL} --create-dirs -o models/${TOOL_MODEL_NAME} - - name: Download vision model (upstream kherud/java-llama.cpp#103 / #34) + - name: Download nomic embedding model (issue #98 regression) + run: test -f models/${NOMIC_EMBED_MODEL_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${NOMIC_EMBED_MODEL_URL} --create-dirs -o models/${NOMIC_EMBED_MODEL_NAME} + - name: Download vision model run: test -f models/${VISION_MODEL_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${VISION_MODEL_URL} --create-dirs -o models/${VISION_MODEL_NAME} - name: Download vision mmproj run: test -f models/${VISION_MMPROJ_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${VISION_MMPROJ_URL} --create-dirs -o models/${VISION_MMPROJ_NAME} + - name: Download TTS model (OuteTTS) + run: test -f models/${TTS_MODEL_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${TTS_MODEL_URL} --create-dirs -o models/${TTS_MODEL_NAME} + - name: Download TTS vocoder (WavTokenizer) + run: test -f models/${TTS_VOCODER_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${TTS_VOCODER_URL} --create-dirs -o models/${TTS_VOCODER_NAME} - name: List files in models directory run: ls -l models/ - name: Validate model files @@ -947,9 +953,12 @@ jobs: run: | mvn -e --no-transfer-progress -Dnet.ladenthin.llama.test.ngl=0 test \ -Dnet.ladenthin.llama.tool.model=models/${TOOL_MODEL_NAME} \ + -Dnet.ladenthin.llama.nomic.path=models/${NOMIC_EMBED_MODEL_NAME} \ -Dnet.ladenthin.llama.vision.model=models/${VISION_MODEL_NAME} \ -Dnet.ladenthin.llama.vision.mmproj=models/${VISION_MMPROJ_NAME} \ - -Dnet.ladenthin.llama.vision.image=${VISION_IMAGE_PATH} + -Dnet.ladenthin.llama.vision.image=${VISION_IMAGE_PATH} \ + -Dnet.ladenthin.llama.tts.ttc.model=models/${TTS_MODEL_NAME} \ + -Dnet.ladenthin.llama.tts.vocoder.model=models/${TTS_VOCODER_NAME} - name: Memory after tests if: always() run: vm_stat && sysctl hw.memsize hw.physmem @@ -1007,10 +1016,16 @@ jobs: run: test -f models/${REASONING_MODEL_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${REASONING_MODEL_URL} --create-dirs -o models/${REASONING_MODEL_NAME} - name: Download tool-calling model run: test -f models/${TOOL_MODEL_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${TOOL_MODEL_URL} --create-dirs -o models/${TOOL_MODEL_NAME} - - name: Download vision model (upstream kherud/java-llama.cpp#103 / #34) + - name: Download nomic embedding model (issue #98 regression) + run: test -f models/${NOMIC_EMBED_MODEL_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${NOMIC_EMBED_MODEL_URL} --create-dirs -o models/${NOMIC_EMBED_MODEL_NAME} + - name: Download vision model run: test -f models/${VISION_MODEL_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${VISION_MODEL_URL} --create-dirs -o models/${VISION_MODEL_NAME} - name: Download vision mmproj run: test -f models/${VISION_MMPROJ_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${VISION_MMPROJ_URL} --create-dirs -o models/${VISION_MMPROJ_NAME} + - name: Download TTS model (OuteTTS) + run: test -f models/${TTS_MODEL_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${TTS_MODEL_URL} --create-dirs -o models/${TTS_MODEL_NAME} + - name: Download TTS vocoder (WavTokenizer) + run: test -f models/${TTS_VOCODER_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${TTS_VOCODER_URL} --create-dirs -o models/${TTS_VOCODER_NAME} - name: List files in models directory run: ls -l models/ - name: Validate model files @@ -1027,9 +1042,12 @@ jobs: run: | mvn -e --no-transfer-progress test \ -Dnet.ladenthin.llama.tool.model=models/${TOOL_MODEL_NAME} \ + -Dnet.ladenthin.llama.nomic.path=models/${NOMIC_EMBED_MODEL_NAME} \ -Dnet.ladenthin.llama.vision.model=models/${VISION_MODEL_NAME} \ -Dnet.ladenthin.llama.vision.mmproj=models/${VISION_MMPROJ_NAME} \ - -Dnet.ladenthin.llama.vision.image=${VISION_IMAGE_PATH} + -Dnet.ladenthin.llama.vision.image=${VISION_IMAGE_PATH} \ + -Dnet.ladenthin.llama.tts.ttc.model=models/${TTS_MODEL_NAME} \ + -Dnet.ladenthin.llama.tts.vocoder.model=models/${TTS_VOCODER_NAME} - name: Memory after tests if: always() run: vm_stat && sysctl hw.memsize hw.physmem @@ -1087,10 +1105,16 @@ jobs: run: test -f models/${REASONING_MODEL_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${REASONING_MODEL_URL} --create-dirs -o models/${REASONING_MODEL_NAME} - name: Download tool-calling model run: test -f models/${TOOL_MODEL_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${TOOL_MODEL_URL} --create-dirs -o models/${TOOL_MODEL_NAME} - - name: Download vision model (upstream kherud/java-llama.cpp#103 / #34) + - name: Download nomic embedding model (issue #98 regression) + run: test -f models/${NOMIC_EMBED_MODEL_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${NOMIC_EMBED_MODEL_URL} --create-dirs -o models/${NOMIC_EMBED_MODEL_NAME} + - name: Download vision model run: test -f models/${VISION_MODEL_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${VISION_MODEL_URL} --create-dirs -o models/${VISION_MODEL_NAME} - name: Download vision mmproj run: test -f models/${VISION_MMPROJ_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${VISION_MMPROJ_URL} --create-dirs -o models/${VISION_MMPROJ_NAME} + - name: Download TTS model (OuteTTS) + run: test -f models/${TTS_MODEL_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${TTS_MODEL_URL} --create-dirs -o models/${TTS_MODEL_NAME} + - name: Download TTS vocoder (WavTokenizer) + run: test -f models/${TTS_VOCODER_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${TTS_VOCODER_URL} --create-dirs -o models/${TTS_VOCODER_NAME} - name: List files in models directory run: ls -l models/ - name: Validate model files @@ -1107,9 +1131,12 @@ jobs: run: | mvn -e --no-transfer-progress test \ -Dnet.ladenthin.llama.tool.model=models/${TOOL_MODEL_NAME} \ + -Dnet.ladenthin.llama.nomic.path=models/${NOMIC_EMBED_MODEL_NAME} \ -Dnet.ladenthin.llama.vision.model=models/${VISION_MODEL_NAME} \ -Dnet.ladenthin.llama.vision.mmproj=models/${VISION_MMPROJ_NAME} \ - -Dnet.ladenthin.llama.vision.image=${VISION_IMAGE_PATH} + -Dnet.ladenthin.llama.vision.image=${VISION_IMAGE_PATH} \ + -Dnet.ladenthin.llama.tts.ttc.model=models/${TTS_MODEL_NAME} \ + -Dnet.ladenthin.llama.tts.vocoder.model=models/${TTS_VOCODER_NAME} - name: Memory after tests if: always() run: vm_stat && sysctl hw.memsize hw.physmem @@ -1164,10 +1191,16 @@ jobs: run: if (-not (Test-Path "models/$env:REASONING_MODEL_NAME")) { curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors $env:REASONING_MODEL_URL --create-dirs -o models/$env:REASONING_MODEL_NAME } - name: Download tool-calling model run: if (-not (Test-Path "models/$env:TOOL_MODEL_NAME")) { curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors $env:TOOL_MODEL_URL --create-dirs -o models/$env:TOOL_MODEL_NAME } - - name: Download vision model (upstream kherud/java-llama.cpp#103 / #34) + - name: Download nomic embedding model (issue #98 regression) + run: if (-not (Test-Path "models/$env:NOMIC_EMBED_MODEL_NAME")) { curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors $env:NOMIC_EMBED_MODEL_URL --create-dirs -o models/$env:NOMIC_EMBED_MODEL_NAME } + - name: Download vision model run: if (-not (Test-Path "models/$env:VISION_MODEL_NAME")) { curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors $env:VISION_MODEL_URL --create-dirs -o models/$env:VISION_MODEL_NAME } - name: Download vision mmproj run: if (-not (Test-Path "models/$env:VISION_MMPROJ_NAME")) { curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors $env:VISION_MMPROJ_URL --create-dirs -o models/$env:VISION_MMPROJ_NAME } + - name: Download TTS model (OuteTTS) + run: if (-not (Test-Path "models/$env:TTS_MODEL_NAME")) { curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors $env:TTS_MODEL_URL --create-dirs -o models/$env:TTS_MODEL_NAME } + - name: Download TTS vocoder (WavTokenizer) + run: if (-not (Test-Path "models/$env:TTS_VOCODER_NAME")) { curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors $env:TTS_VOCODER_URL --create-dirs -o models/$env:TTS_VOCODER_NAME } - name: List files in models directory run: ls -l models/ - name: Validate model files @@ -1200,9 +1233,12 @@ jobs: run: | mvn -e --no-transfer-progress test ` "-Dnet.ladenthin.llama.tool.model=models/$env:TOOL_MODEL_NAME" ` + "-Dnet.ladenthin.llama.nomic.path=models/$env:NOMIC_EMBED_MODEL_NAME" ` "-Dnet.ladenthin.llama.vision.model=models/$env:VISION_MODEL_NAME" ` "-Dnet.ladenthin.llama.vision.mmproj=models/$env:VISION_MMPROJ_NAME" ` - "-Dnet.ladenthin.llama.vision.image=$env:VISION_IMAGE_PATH" + "-Dnet.ladenthin.llama.vision.image=$env:VISION_IMAGE_PATH" ` + "-Dnet.ladenthin.llama.tts.ttc.model=models/$env:TTS_MODEL_NAME" ` + "-Dnet.ladenthin.llama.tts.vocoder.model=models/$env:TTS_VOCODER_NAME" - name: Memory after tests if: always() run: Get-CimInstance Win32_OperatingSystem | Select-Object FreePhysicalMemory,TotalVisibleMemorySize | Format-List @@ -1264,10 +1300,16 @@ jobs: run: if (-not (Test-Path "models/$env:REASONING_MODEL_NAME")) { curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors $env:REASONING_MODEL_URL --create-dirs -o models/$env:REASONING_MODEL_NAME } - name: Download tool-calling model run: if (-not (Test-Path "models/$env:TOOL_MODEL_NAME")) { curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors $env:TOOL_MODEL_URL --create-dirs -o models/$env:TOOL_MODEL_NAME } - - name: Download vision model (upstream kherud/java-llama.cpp#103 / #34) + - name: Download nomic embedding model (issue #98 regression) + run: if (-not (Test-Path "models/$env:NOMIC_EMBED_MODEL_NAME")) { curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors $env:NOMIC_EMBED_MODEL_URL --create-dirs -o models/$env:NOMIC_EMBED_MODEL_NAME } + - name: Download vision model run: if (-not (Test-Path "models/$env:VISION_MODEL_NAME")) { curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors $env:VISION_MODEL_URL --create-dirs -o models/$env:VISION_MODEL_NAME } - name: Download vision mmproj run: if (-not (Test-Path "models/$env:VISION_MMPROJ_NAME")) { curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors $env:VISION_MMPROJ_URL --create-dirs -o models/$env:VISION_MMPROJ_NAME } + - name: Download TTS model (OuteTTS) + run: if (-not (Test-Path "models/$env:TTS_MODEL_NAME")) { curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors $env:TTS_MODEL_URL --create-dirs -o models/$env:TTS_MODEL_NAME } + - name: Download TTS vocoder (WavTokenizer) + run: if (-not (Test-Path "models/$env:TTS_VOCODER_NAME")) { curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors $env:TTS_VOCODER_URL --create-dirs -o models/$env:TTS_VOCODER_NAME } - name: List files in models directory run: ls -l models/ - name: Validate model files @@ -1300,9 +1342,12 @@ jobs: run: | mvn -e --no-transfer-progress test ` "-Dnet.ladenthin.llama.tool.model=models/$env:TOOL_MODEL_NAME" ` + "-Dnet.ladenthin.llama.nomic.path=models/$env:NOMIC_EMBED_MODEL_NAME" ` "-Dnet.ladenthin.llama.vision.model=models/$env:VISION_MODEL_NAME" ` "-Dnet.ladenthin.llama.vision.mmproj=models/$env:VISION_MMPROJ_NAME" ` - "-Dnet.ladenthin.llama.vision.image=$env:VISION_IMAGE_PATH" + "-Dnet.ladenthin.llama.vision.image=$env:VISION_IMAGE_PATH" ` + "-Dnet.ladenthin.llama.tts.ttc.model=models/$env:TTS_MODEL_NAME" ` + "-Dnet.ladenthin.llama.tts.vocoder.model=models/$env:TTS_VOCODER_NAME" - name: Memory after tests if: always() run: Get-CimInstance Win32_OperatingSystem | Select-Object FreePhysicalMemory,TotalVisibleMemorySize | Format-List diff --git a/CHANGELOG.md b/CHANGELOG.md index 5d0a0239..264e0160 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,6 +18,7 @@ from version 5.0.0 onward. Pre-fork releases (`1.x`–`4.2.0`) were authored by - End-to-end vision input across blocking, typed `ChatRequest`, streaming, and OpenAI-compatible request mapping; real-model tests verify that distinct red and blue images produce the correct semantic answers. - Explicit `setMmprojAuto(boolean)` and `setMmprojOffload(boolean)` controls, including the upstream `--no-mmproj-auto` and `--no-mmproj-offload` flags. - Per-request KV controls: `InferenceParameters.withSlotId(int)` and `withCacheReuse(int)`. +- Per-request DRY sampling to `InferenceParameters` (`dry_multiplier`/`dry_base`/`dry_allowed_length`/`dry_penalty_last_n`/`dry_sequence_breakers`). - Typed cache observability through `Usage.getCachedTokens()`, `Usage.getProcessedPromptTokens()`, `SlotMetrics`, and `ServerMetrics.getSlotMetrics()`. - Authenticated JSON `GET /metrics` and `GET /slots` endpoints on the embedded server. @@ -27,9 +28,12 @@ from version 5.0.0 onward. Pre-fork releases (`1.x`–`4.2.0`) were authored by - README license badge corrected from "Apache 2.0" to "MIT" (matches `LICENSE` file and `pom.xml`). - `pom.xml` SCM URL: `tree/master` → `tree/main` (default branch renamed). - Upgraded llama.cpp from b9151 to b9172. +- Upgraded llama.cpp from b9803 to b9829. Compiles the new upstream `server-stream.cpp` (resumable-streaming SSE replay buffer) into `libjllama`, required because `server-context`/`server-http`/`server-models` now reference its symbols; refreshed `patches/0001` for the `tests/test-export-graph-ops.cpp` rename and the `server.cpp` GC-init context shift. +- `configureParallelInference` now applies `slot_prompt_similarity` live via `server_context::set_slot_prompt_similarity()` (upstream PR ggml-org/llama.cpp#22393, carried as `patches/0003` until merged), instead of validating it and discarding the value. - Extracted the `chatWithTools` agent loop into `ToolCallingAgent`; tool-result errors (unknown tool / handler exception) are now JSON-serialized so tool names containing special characters remain valid JSON. ### Fixed +- Per-request `reasoning_budget_tokens` is now honored (via `patches/0004`, upstream PR ggml-org/llama.cpp#23116): `reasoning_budget_tokens=0` suppresses thinking. `ReasoningBudgetTest` now asserts the suppression directly (the previous test that pinned the unfixed-bug behavior was removed). - Preserved decoded image buffers across the JNI chat boundary and submitted media requests through llama.cpp's upstream multimodal task path instead of silently tokenizing them as text-only prompts. - Preserved multipart image content when using the typed `ChatRequest` serializer. - The standalone OpenAI-compatible server now advertises vision only when the loaded model confirms usable vision support. diff --git a/CLAUDE.md b/CLAUDE.md index 676fe0f4..1c2094d8 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -6,7 +6,7 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co Java bindings for [llama.cpp](https://github.com/ggerganov/llama.cpp) via JNI, providing a high-level API for LLM inference in Java. The Java layer communicates with a native C++ library through JNI. -Current llama.cpp pinned version: **b9803** +Current llama.cpp pinned version: **b9829** ## Upgrading CUDA Version @@ -241,7 +241,7 @@ needs no extra step here, `build-webui` re-reads the tag and rebuilds the matchi ships no UI): ```bash # needs node/npm + network; embed.cpp is plain C++17 (no npm) -git clone --depth 1 --branch b9803 https://github.com/ggml-org/llama.cpp /tmp/lc +git clone --depth 1 --branch b9829 https://github.com/ggml-org/llama.cpp /tmp/lc ( cd /tmp/lc/tools/ui && npm ci && npm run build \ && ( cd dist && find . -type f -not -path './_gzip/*' \ | while read -r f; do mkdir -p "_gzip/$(dirname "$f")"; gzip -9 -c "$f" > "_gzip/$f"; done ) \ @@ -275,7 +275,7 @@ plus a cache token are present, `build.sh` adds - `SCCACHE_WEBDAV_TOKEN: ${{ secrets.DEPOT_TOKEN }}` — a Depot **organization** token, stored as the repo secret **`DEPOT_TOKEN`**. -Because `sccache` is **content-addressed** and llama.cpp is pinned (`GIT_TAG b9803`), the +Because `sccache` is **content-addressed** and llama.cpp is pinned (`GIT_TAG b9829`), the ~280 upstream object files are byte-identical every run, so a warm cache recompiles only the *changed* files. Depot's cache is **shared across all branches** (unlike GitHub's per-branch `actions/cache`), so every branch builds incrementally; a `b` version bump @@ -384,6 +384,8 @@ Current patches: |-------|-------| | `0001-win32-arg-parse-embed-guard.patch` | Windows JNI regression from llama.cpp **#24779** (introduced b9739): on Windows `common_params_parse` re-derived argv from the **process** command line (`GetCommandLineW`) and adopted it, so an embedded/JNI caller (`java.exe`) lost its `--model …` args → "Failed to parse model parameters". b9789 narrowed the unconditional override to a **count-guard** (`if (static_cast(utf8.buf.size()) == argc) { argv = utf8.ptrs.data(); }`), but that is exactly the variant the project already found breaks its Windows server-integration tests (when the embedded argv length coincides with `java.exe`'s). The patch carries the **complete upstream change** (so it can be submitted to llama.cpp verbatim and then dropped here): **(1)** `common_params_parse` parses **exactly the argv it is given** (no `GetCommandLineW` magic) and a new `common_params_parse_main()` wrapper holds the UTF-8 recovery for the standalone tools' `main()` (`common/arg.{cpp,h}`); **(2)** the **~34 standalone `main()` call sites** (every `common_params_parse(argc, argv, …)` across `tools/*`, `examples/*` and the `tests/*` programs) flip to `common_params_parse_main()`; **(3)** a `tests/test-arg-parser.cpp` regression case pins that `common_params_parse` honors a caller-supplied argv. The embedded caller (`jllama.cpp`) keeps calling `common_params_parse` and is never overridden. **Our subproject build compiles only the `arg.{cpp,h}` core** — `LLAMA_BUILD_TOOLS`/`LLAMA_BUILD_TESTS` are OFF for a FetchContent subproject — so the flips + test are applied-but-not-compiled here; they were validated via a one-off `-DLLAMA_BUILD_TOOLS=ON -DLLAMA_BUILD_TESTS=ON` build (the new test compiles and its asserts pass; `test-arg-parser`'s only red there is the live `ggml.ai` download check, which is sandbox-network, not the patch). Because it spans **37 files** it must be refreshed on every llama.cpp bump (the applier fails loud). | | `0002-server-preserve-caller-load-progress-callback.patch` | Load-progress-callback regression introduced in llama.cpp **b9789**: `server_context::load_model` (`tools/server/server-context.cpp`) now **unconditionally** installs the server's own load-progress reporter on `params_base.load_progress_callback` immediately before `common_init_from_params`, clobbering any callback the embedding caller already set. libjllama's `LoadProgressCallback` feature wires `common_params.load_progress_callback` to a JNI trampoline *before* calling `load_model`, so the bump silently killed it — `LoadProgressCallbackTest` saw zero progress updates and the abort-on-`false` path never threw. The patch guards the assignment with `if (params_base.load_progress_callback == nullptr)`, so the server installs its own reporter **only when the caller hasn't** — a caller-supplied callback survives and fires during load. Standalone `llama-server` (no caller callback, so the field is null) is unaffected. Same JNI-vs-standalone divergence class as `0001`. | +| `0003-pr22393-server-add-slot-prompt-similarity-getter-setter.patch` | **Upstream-PR carry** of [ggml-org/llama.cpp#22393](https://github.com/ggml-org/llama.cpp/pull/22393) ("server : add slot_prompt_similarity getter/setter") while it is still open upstream. Purely additive: adds `server_context::get_slot_prompt_similarity()` / `set_slot_prompt_similarity(float)` (`tools/server/server-context.{cpp,h}`) so an embedding/JNI caller can query and tune the slot-selection threshold at runtime without reloading the model. Verbatim copy of the PR — drop it once a pinned `b` includes the change. | +| `0004-pr23116-server-per-request-reasoning-budget-tokens.patch` | **Upstream-PR carry** of [ggml-org/llama.cpp#23116](https://github.com/ggml-org/llama.cpp/pull/23116) ("server: honour per-request reasoning_budget_tokens in chat completions"), motivated by java-llama.cpp#140, while it is still open upstream. `oaicompat_chat_params_parse` (`tools/server/server-common.cpp`) only read the Anthropic `thinking_budget_tokens` alias and always wrote the server-level `reasoning_budget_message`, so a per-request `reasoning_budget_tokens` / `reasoning_budget_message` on a chat-completions request was ignored. The patch reads both overrides **before** the generic copy loop (precedence: `reasoning_budget_tokens` > `thinking_budget_tokens` alias > server default) and threads the per-request message through. Carries the upstream `tests/test-chat.cpp` additions verbatim so the patch is submittable as-is; like `0001`'s test/call-site flips they are **applied-but-not-compiled** here (`LLAMA_BUILD_TESTS` is OFF for the FetchContent subproject). Drop it once a pinned `b` includes the change. | ## OuteTTS build-time extraction (`cmake/generate-tts-upstream.cmake`) @@ -618,7 +620,7 @@ the README. The summary below covers only the optional-model bindings: | Property | Default test that uses it | Model | |----------|---------------------------|-------| | `net.ladenthin.llama.nomic.path` | `LlamaEmbeddingsTest#testNomicEmbedLoads` | `nomic-embed-text-v1.5.f16.gguf` (issue #98 regression) | -| `net.ladenthin.llama.vision.model` | `MultimodalIntegrationTest` (upstream kherud/java-llama.cpp#103 / #34) | `SmolVLM-500M-Instruct-Q8_0.gguf` (any vision-capable GGUF works) | +| `net.ladenthin.llama.vision.model` | `MultimodalIntegrationTest` | `SmolVLM-500M-Instruct-Q8_0.gguf` (any vision-capable GGUF works) | | `net.ladenthin.llama.vision.mmproj` | `MultimodalIntegrationTest` | matching mmproj for the vision model, e.g. `mmproj-SmolVLM-500M-Instruct-Q8_0.gguf` | | `net.ladenthin.llama.vision.image` | `MultimodalIntegrationTest` | committed default `src/test/resources/images/test-image.jpg`; override to any png/jpeg/webp/gif on disk | | `net.ladenthin.llama.audio.model` | `AudioInputIntegrationTest` (llama.cpp discussion #13759) | audio-input model GGUF, e.g. `ultravox-v0_5-llama-3_2-1b.gguf` | @@ -797,7 +799,7 @@ If the local check passes (`BUILD SUCCESS`), the `mvn package` job in - `json_helpers.hpp` — Pure JSON transformation helpers (no JNI, no llama state). Independently unit-testable. - `jni_helpers.hpp` — JNI bridge helpers (handle management + server orchestration). Includes `json_helpers.hpp`. - Uses `nlohmann/json` for JSON deserialization of parameters. -- The upstream server library (`server-context.cpp`, `server-queue.cpp`, `server-task.cpp`, `server-models.cpp`) is compiled directly into `jllama` via CMake — there is no hand-ported `server.hpp` fork. **Phase 2:** the upstream HTTP transport (`tools/server/server-http.cpp`) and its `cpp-httplib` backend (`vendor/cpp-httplib/httplib.cpp`) are now compiled into `jllama` too, so the OpenAI-compatible server can be driven natively from JNI *inside* `libjllama` — no separate `llama-server` executable (a JNI shared library loads anywhere a JVM runs, which a standalone binary does not). `server-http.cpp` does `#include "ui.h"` (the WebUI asset table that `tools/ui`/`llama-ui` normally generates); since the Svelte WebUI is not shipped, `src/main/cpp/webui_stub/ui.h` supplies the upstream **empty-asset** interface and leaves `LLAMA_UI_HAS_ASSETS` undefined (all static-asset-serving blocks compile out). `` already resolves via `llama-common`'s `vendor/` include dir (same nlohmann/json 3.12.0 as the FetchContent copy). No SSL: `CPPHTTPLIB_OPENSSL_SUPPORT` is left undefined (plain-HTTP; bind localhost / front with a TLS proxy). Only `server.cpp` (the standalone `main()` + route wiring) remains excluded — wiring the routes to JNI is the next step. +- The upstream server library (`server-context.cpp`, `server-queue.cpp`, `server-task.cpp`, `server-schema.cpp`, `server-models.cpp`, and — since b9829 — `server-stream.cpp`) is compiled directly into `jllama` via CMake — there is no hand-ported `server.hpp` fork. **`server-stream.cpp` is mandatory, not optional:** it defines the resumable-streaming SSE replay buffer (`g_stream_sessions`, `stream_session_attach_pipe`, `stream_aware_should_stop`, `stream_conv_id_from_headers`, the `stream_pipe_*` types) that `server-context.cpp` / `server-http.cpp` / `server-models.cpp` now `#include "server-stream.h"` and call, so omitting it fails the link with undefined references. It is platform-neutral (threads + std mutex/condvar, no `subprocess.h`/`posix_spawn_*`), so it builds on Android too and sits outside the `server-models.cpp` Android guard. `jllama` wires its own JNI routes and never calls `g_stream_sessions.start_gc()` (only the excluded standalone `server.cpp` `main()` does), so its GC thread stays dormant. **Phase 2:** the upstream HTTP transport (`tools/server/server-http.cpp`) and its `cpp-httplib` backend (`vendor/cpp-httplib/httplib.cpp`) are now compiled into `jllama` too, so the OpenAI-compatible server can be driven natively from JNI *inside* `libjllama` — no separate `llama-server` executable (a JNI shared library loads anywhere a JVM runs, which a standalone binary does not). `server-http.cpp` does `#include "ui.h"` (the WebUI asset table that `tools/ui`/`llama-ui` normally generates); since the Svelte WebUI is not shipped, `src/main/cpp/webui_stub/ui.h` supplies the upstream **empty-asset** interface and leaves `LLAMA_UI_HAS_ASSETS` undefined (all static-asset-serving blocks compile out). `` already resolves via `llama-common`'s `vendor/` include dir (same nlohmann/json 3.12.0 as the FetchContent copy). No SSL: `CPPHTTPLIB_OPENSSL_SUPPORT` is left undefined (plain-HTTP; bind localhost / front with a TLS proxy). Only `server.cpp` (the standalone `main()` + route wiring) remains excluded — wiring the routes to JNI is the next step. ### Native Helper Architecture @@ -914,6 +916,22 @@ Require a model file. The CI downloads models from HuggingFace: - **LlamaModel tests**: CodeLlama-7B-GGUF (`codellama-7b.Q2_K.gguf`) - **RerankingModel tests**: Jina-Reranker model +**CI model policy (publish.yml): the full model set is downloaded and exercised on EVERY +Java test job** — Linux x86_64, all three macOS arm64 jobs (Metal / no-Metal / Metal-15), and +both Windows jobs (MSVC + Ninja). That includes the nomic embedding model, the SmolVLM vision +model + mmproj, and the OuteTTS + WavTokenizer TTS pair, with their `-Dnet.ladenthin.llama.*` +properties set, so `LlamaEmbeddingsTest`, `MultimodalIntegrationTest`, and `TtsIntegrationTest` +**run on every platform** rather than self-skipping. `validate-models.{sh,bat}` treats all of +these as **required** (a missing model hard-fails the job before tests run, so a download +regression can never silently downgrade to a skip). The only model still self-skipping is the +audio-input model (`AudioInputIntegrationTest`) — it has no committed clip and no CI download. +The shared GGUF cache (`actions/cache`, key `gguf-models-v1`, path `models/`) holds the full set; +since every test job downloads the full set before the cache can save, whichever job wins the +save race caches everything. Because the cache key is immutable, changing the model set means the +**existing cache entry must be deleted** (not bumped to `v2`) so the next run rebuilds it complete +— locally the model tests still self-skip when a GGUF is absent (`Assume.assumeTrue`), so a +partial local checkout is fine. + Set the model path via system property or environment variable (see test files for exact property names). Test files are in `src/test/java/net/ladenthin/llama/` and `src/test/java/examples/`. @@ -947,17 +965,17 @@ ctest --test-dir build --output-on-failure -R "ResultsToJson" | File | Tests | Scope | |------|-------|-------| | `src/test/cpp/test_utils.cpp` | 156 | Upstream helpers: `server_tokens`, `server_grammar_trigger`, `gen_tool_call_id`, `json_value`, `json_get_nested_values`, UTF-8 helpers, `format_response_rerank`, `format_embeddings_response_oaicompat`, `oaicompat_completion_params_parse`, `oaicompat_chat_params_parse`, `are_lora_equal`, `strip_flag_from_argv`, `token_piece_value`, `json_is_array_and_contains_numbers`, `format_oai_sse`, `format_oai_resp_sse`, `format_anthropic_sse` | -| `src/test/cpp/test_server.cpp` | 189 | Upstream result types: `result_timings`, `task_params::to_json()` (incl. `dry_sequence_breakers`, `preserved_tokens`, `timings_per_token`), `completion_token_output`, `server_task_result_cmpl_partial` (non-oaicompat + `to_json_oaicompat` + logprobs + `to_json_oaicompat_chat` + `to_json_anthropic` + dispatcher), `server_task_result_cmpl_final` (non-oaicompat + `to_json_oaicompat` + `to_json_oaicompat_chat` + `to_json_oaicompat_chat_stream` + `to_json_anthropic` + `to_json_anthropic_stream` + tool_calls + dispatcher), `server_task_result_embd`, `server_task_result_rerank`, `server_task_result_metrics`, `server_task_result_slot_save_load`, `server_task_result_slot_erase`, `server_task_result_apply_lora`, `server_task_result_error`, `format_error_response`, `server_task::need_sampling()`, `server_task::n_tokens()`, `server_schema::eval_llama_cmpl_schema()` (parsing pipeline + grammar routing + error paths), `response_fields` projection | +| `src/test/cpp/test_server.cpp` | 194 | Upstream result types: `result_timings`, `task_params::to_json()` (incl. `dry_sequence_breakers`, `preserved_tokens`, `timings_per_token`), `completion_token_output`, `server_task_result_cmpl_partial` (non-oaicompat + `to_json_oaicompat` + logprobs + `to_json_oaicompat_chat` + `to_json_anthropic` + dispatcher), `server_task_result_cmpl_final` (non-oaicompat + `to_json_oaicompat` + `to_json_oaicompat_chat` + `to_json_oaicompat_chat_stream` + `to_json_anthropic` + `to_json_anthropic_stream` + tool_calls + dispatcher), `server_task_result_embd`, `server_task_result_rerank`, `server_task_result_metrics`, `server_task_result_slot_save_load`, `server_task_result_slot_erase`, `server_task_result_apply_lora`, `server_task_result_error`, `format_error_response`, `server_task::need_sampling()`, `server_task::n_tokens()`, `server_schema::eval_llama_cmpl_schema()` (parsing pipeline + grammar routing + error paths + per-request `dry_*` field round-trips), `response_fields` projection | | `src/test/cpp/test_json_helpers.cpp` | 47 | All functions in `json_helpers.hpp`: `get_result_error_message`, `results_to_json`, `rerank_results_to_json`, `parse_encoding_format`, `extract_embedding_prompt`, `is_infill_request`, `parse_slot_prompt_similarity`, `parse_positive_int_config`, `wrap_stream_chunk` | | `src/test/cpp/test_log_helpers.cpp` | 13 | All functions in `log_helpers.hpp`: `log_level_name`, `format_log_as_json` | | `src/test/cpp/test_jni_helpers.cpp` | 47 | All functions in `jni_helpers.hpp` using a zero-filled `JNINativeInterface_` mock | | `src/test/cpp/test_tts_wav.cpp` | 2 | The in-memory WAV writer `pcm_to_wav16_bytes` in `tts_wav.hpp` (WAV header/payload + little-endian clamping). The OuteTTS DSP it pairs with is derived from upstream `tts.cpp` and covered end-to-end by the Java `TtsIntegrationTest`, not unit-tested here. | -**Current total: 454 tests (all passing).** +**Current total: 459 tests (all passing).** #### Upstream source location (in CMake build tree) -llama.cpp is fetched via CMake FetchContent, pinned to `GIT_TAG b9803`. +llama.cpp is fetched via CMake FetchContent, pinned to `GIT_TAG b9829`. **GoogleTest** is a separate `BUILD_TESTING`-only FetchContent (`GIT_TAG v1.17.0`), used solely by the `jllama_test` C++ unit-test binary — not by the shipped library, and not coupled to the diff --git a/CMakeLists.txt b/CMakeLists.txt index 9a1eab6e..8e3e7846 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -143,7 +143,7 @@ set(LLAMA_BUILD_APP OFF CACHE BOOL "" FORCE) FetchContent_Declare( llama.cpp GIT_REPOSITORY https://github.com/ggerganov/llama.cpp.git - GIT_TAG b9803 + GIT_TAG b9829 PATCH_COMMAND ${CMAKE_COMMAND} -DPATCH_DIR=${CMAKE_CURRENT_SOURCE_DIR}/patches -DLLAMA_SRC= @@ -166,7 +166,7 @@ execute_process( COMMAND ${CMAKE_COMMAND} -DTTS_SRC=${llama.cpp_SOURCE_DIR}/tools/tts/tts.cpp -DOUT_CPP=${JLLAMA_TTS_GEN_CPP} - -DLLAMA_TAG=b9803 + -DLLAMA_TAG=b9829 -P ${CMAKE_CURRENT_SOURCE_DIR}/cmake/generate-tts-upstream.cmake RESULT_VARIABLE JLLAMA_TTS_GEN_RESULT ) @@ -306,11 +306,23 @@ endif() # in subprocess.h which calls posix_spawn_*, declared but not implemented by the # Android NDK. Guard with both ANDROID_ABI (NDK toolchain convention) and # OS_NAME (always set to "Linux-Android" by the CI cmake invocation). +# +# server-stream.cpp (added upstream in b9829) owns the resumable-streaming SSE +# replay buffer (g_stream_sessions, stream_session_attach_pipe, +# stream_aware_should_stop, stream_conv_id_from_headers, stream_pipe_*). +# server-context.cpp / server-http.cpp / server-models.cpp now #include +# "server-stream.h" and reference those symbols, so it MUST be compiled in or the +# link fails with undefined references. It is platform-neutral (threads + the +# std mutex/condvar primitives, no subprocess.h / posix_spawn_*), so it builds on +# Android too and stays outside the server-models Android guard below. jllama +# wires its own routes and never calls g_stream_sessions.start_gc() (only the +# standalone server.cpp main does), so the GC thread stays dormant here. target_sources(jllama PRIVATE ${llama.cpp_SOURCE_DIR}/tools/server/server-context.cpp ${llama.cpp_SOURCE_DIR}/tools/server/server-queue.cpp ${llama.cpp_SOURCE_DIR}/tools/server/server-task.cpp ${llama.cpp_SOURCE_DIR}/tools/server/server-schema.cpp + ${llama.cpp_SOURCE_DIR}/tools/server/server-stream.cpp ) if(NOT ANDROID_ABI AND NOT OS_NAME MATCHES "Android") target_sources(jllama PRIVATE @@ -451,6 +463,7 @@ if(BUILD_TESTING) ${llama.cpp_SOURCE_DIR}/tools/server/server-task.cpp ${llama.cpp_SOURCE_DIR}/tools/server/server-schema.cpp ${llama.cpp_SOURCE_DIR}/tools/server/server-models.cpp + ${llama.cpp_SOURCE_DIR}/tools/server/server-stream.cpp ) target_include_directories(jllama_test PRIVATE diff --git a/README.md b/README.md index 1b052d81..86b51236 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ **Build:** ![Java 8+](https://img.shields.io/badge/Java-8%2B-informational) ![Platform](https://img.shields.io/badge/Platform-Linux%20%7C%20macOS%20%7C%20Windows%20%7C%20Android-lightgrey) -[![llama.cpp b9803](https://img.shields.io/badge/llama.cpp-%23b9803-informational)](https://github.com/ggml-org/llama.cpp/releases/tag/b9803) +[![llama.cpp b9829](https://img.shields.io/badge/llama.cpp-%23b9829-informational)](https://github.com/ggml-org/llama.cpp/releases/tag/b9829) [![JPMS](https://img.shields.io/badge/JPMS-modular%20JAR-25A162)](https://openjdk.org/projects/jigsaw/) ![JUnit](https://img.shields.io/badge/tested%20with-JUnit6-25A162) [![JSpecify](https://img.shields.io/badge/JSpecify-1.0.0%20%40NullMarked-25A162)](https://jspecify.dev) @@ -282,7 +282,7 @@ Every `net.ladenthin.llama.*` system property recognised by the library, deep-sc | `net.ladenthin.llama.test.ngl` | `43` for the general suite; `0` for `ToolCallingIntegrationTest` | test | Model-backed integration tests | Number of GPU layers used during testing. Pin to `0` on CPU-only hosts: `mvn test -Dnet.ladenthin.llama.test.ngl=0`. The tool test also selects device `none` at zero layers so Metal/CUDA is not initialized. | | `net.ladenthin.llama.tool.model` | `models/Qwen2.5-1.5B-Instruct-Q4_K_M.gguf` (test self-skips if missing) | test | `ToolCallingIntegrationTest` | Path to a tool-capable GGUF used to verify required blocking and streaming tool calls. The default matches the Qwen2.5 model in upstream llama.cpp's tool-call test matrix. | | `net.ladenthin.llama.nomic.path` | unset (test self-skips) | test | `LlamaEmbeddingsTest#testNomicEmbedLoads` | Path to a Nomic embedding model (`nomic-embed-text-v1.5.f16.gguf` or a compatible BERT-family encoder). Regression test for upstream issue #98 (BERT-encoder `result_output` assertion). | -| `net.ladenthin.llama.vision.model` | unset (test self-skips) | test | `MultimodalIntegrationTest` (upstream kherud/java-llama.cpp#103 / #34) | Path to a vision-capable model GGUF. Any vision-capable GGUF works; CI default is `SmolVLM-500M-Instruct-Q8_0.gguf`. | +| `net.ladenthin.llama.vision.model` | unset (test self-skips) | test | `MultimodalIntegrationTest` | Path to a vision-capable model GGUF. Any vision-capable GGUF works; CI default is `SmolVLM-500M-Instruct-Q8_0.gguf`. | | `net.ladenthin.llama.vision.mmproj` | unset (test self-skips) | test | `MultimodalIntegrationTest` | Matching mmproj GGUF for the vision model. | | `net.ladenthin.llama.vision.image` | `src/test/resources/images/test-image.jpg` (a CC-BY-4.0 / MIT-granted photo committed to the repo) | test | `MultimodalIntegrationTest` | Visual prompt image. Any png/jpeg/webp/gif works; the extension drives MIME detection. | | `net.ladenthin.llama.audio.model` | unset (test self-skips) | test | `AudioInputIntegrationTest` (llama.cpp discussion #13759) | Path to an audio-input model GGUF (e.g. Ultravox, Qwen2.5-Omni). | diff --git a/docs/history/llama-cpp-breaking-changes.md b/docs/history/llama-cpp-breaking-changes.md index aaf28889..a281e549 100644 --- a/docs/history/llama-cpp-breaking-changes.md +++ b/docs/history/llama-cpp-breaking-changes.md @@ -392,3 +392,9 @@ Used during `llama.cpp` version bumps: when upgrading, scan this file from the r | b9789–b9803 | `common/arg.cpp` + `docs/speculative.md` | **New feature** — EAGLE-3 speculative decoding (`--spec-type draft-eagle3`): a small one-layer draft transformer that reads the target model's hidden states for higher acceptance; plus a new standalone `llama download` / `llama get` subcommand (`app/download.cpp`, `LLAMA_EXAMPLE_DOWNLOAD`) and a `--mtp` download flag. Server-level CLI; not surfaced by `ModelParameters`/`InferenceParameters`. Could later feed an inference-parameter setter (`--spec-type`). No project source changes required | | b9789–b9803 | `ggml/src/ggml-cuda/{binbcast,cpy}.cu` + `ggml-opencl` + `src/llama-model.{cpp,h}` + `src/models/lfm2.cpp` | Backend/model-internal only: CUDA `binbcast`/`cpy` kernels reworked for >INT_MAX index safety (int→uint32/int64 widening + overflow guards); OpenCL flushes the profiling batch on context teardown; new `LLM_TYPE_230M` mapped for LFM2 (`n_ff == 2560`). No API surface visible to `jllama.cpp`; CUDA set only affects the `cuda13-linux-x86-64` classifier, OpenCL only the `opencl-android-aarch64` classifier. No project source changes required | | b9789–b9803 | upstream verification (sandbox) | Both `patches/0001-win32-arg-parse-embed-guard.patch` (37 files) and `patches/0002-server-preserve-caller-load-progress-callback.patch` re-verified to apply cleanly against b9803 via `git apply --check` over the actual b9803 sources fetched from `raw.githubusercontent.com` (github.com git-clone is blocked in this sandbox, so a full `FetchContent` build could not run — exit 0 for both patches). Patch 0001's `common_params_parse` target region is byte-identical to b9789; the b9803 arg.cpp churn is confined to the `common_models_handler` rewrite and `set_examples` tags, which don't overlap the patched hunks. OuteTTS generator anchors hold (upstream `tts.cpp` unchanged in this range apart from patch 0001's main()-only parse flip). Full build + `ctest` to be confirmed by the CI pipeline | +| b9803–b9829 | `tools/server/server-stream.{cpp,h}` (new) + `server-context.cpp` + `server-http.{cpp,h}` + `server-models.{cpp,h}` + `server.cpp` + `CMakeLists.txt` | **Build-breaking.** Upstream added a **resumable-streaming SSE replay buffer** (PR #23226): a new TU `server-stream.cpp` defines `g_stream_sessions` (a process-wide `stream_session_manager`), `stream_session_attach_pipe()`, `stream_aware_should_stop()`, `stream_conv_id_from_headers()`, and the `stream_pipe_producer`/`stream_pipe_consumer` types. The three server TUs the project already compiles into `jllama` — `server-context.cpp`, `server-http.cpp`, `server-models.cpp` — now `#include "server-stream.h"` and reference those symbols (`server_res_generator` gained a `stop()` override + a `~server_res_generator` that calls `spipe->cleanup()`; `server_http_res` gained a `std::shared_ptr spipe` member + virtual `stop()`; `server-models` tracks a `conv_id → model` map). **Required project change:** add `${llama.cpp_SOURCE_DIR}/tools/server/server-stream.cpp` to **both** the `target_sources(jllama ...)` block and the `jllama_test` `add_executable(...)` sources in `CMakeLists.txt`, or the link fails with undefined references. It is platform-neutral (threads + std mutex/condvar, no `subprocess.h`/`posix_spawn_*`), so it builds on Android too and sits **outside** the `server-models.cpp` Android guard. `jllama` wires its own JNI routes and never calls `g_stream_sessions.start_gc()` (only the excluded standalone `server.cpp` `main()` does), so the GC thread stays dormant — the resumable-stream HTTP routes are not active in the embedded library. **New feature:** resumable SSE streams (reattach after a dropped socket via `X-Conversation-Id`) could later be wired into the project's Java `OpenAiCompatServer`. | +| b9803–b9829 | `tools/server/server.cpp` + `tests/export-graph-ops.cpp` → `tests/test-export-graph-ops.cpp` (rename) (**patch 0001 targets**) | **Patch refresh.** `patches/0001-win32-arg-parse-embed-guard.patch` stopped applying for two reasons: (1) upstream **renamed** `tests/export-graph-ops.cpp` → `tests/test-export-graph-ops.cpp` (also the `llama-export-graph-ops` artifact text), so the patch's call-site-flip hunk targeted a now-missing path; (2) the resumable-stream PR inserted `g_stream_sessions.start_gc();` right after `common_init()` in `server.cpp`, shifting the context of the `common_params_parse → common_params_parse_main` flip (`@@ -82 → @@ -87`). Both hunks were regenerated against b9829 (path + index + `@@` + leading context). Patch content is otherwise unchanged; the flips remain applied-but-not-compiled here (`LLAMA_BUILD_TOOLS`/`TESTS` OFF). Patches 0002/0003/0004 apply unchanged (their target regions — `server-context.cpp` load-progress guard, the `get_meta`/`get_response_reader` area for the slot-prompt-similarity getter/setter, and `server-common.cpp`/`test-chat.cpp` — were untouched in this range). | +| b9803–b9829 | `src/models/mamba2.cpp` + `src/models/mamba-base.cpp` + `conversion/mamba.py` | Mamba2 generalized beyond a fixed expansion factor of 2: `d_in_proj` now derived from `ssm_dt_rank` + `conv_dim` (was `2*d_inner + 2*n_group*d_state + n_head`), the `GGML_ASSERT(2*n_embd == d_inner)` / `d_inner % d_state == 0` asserts removed, and `ssm_dt_b`/`ssm_a`/`ssm_d` tensor shapes keyed on `dt_rank`. Model-build internals inside upstream-compiled `libllama`; no symbol the project binds. No project source changes required | +| b9803–b9829 | `ggml/src/ggml-opencl/` (FA q4_0/q8_0 KV, +5 new kernel files) + `ggml/src/ggml-cuda/{cpy,out-prod}.cu` + `ggml/src/ggml-vulkan/` + `ggml/src/ggml-sycl/{norm,softmax}.cpp` + `ggml/src/ggml-openvino/` | Backend-internal only: OpenCL gains native flash-attention over quantized (q4_0/q8_0) KV cache + flash-decoding split kernels + Adreno X2/Xe tuning (new `fa_tune.h`, `flash_attn_pre_f16.cl`, `flash_attn_f32_q{4,8}_0.cl`, `cvt.cl`/`set_rows.cl` SoA quant variants); CUDA adds a `cudaMemcpy2DAsync` fast path for strided same-type copies, batched `cublasSgemmBatched` out-prod, and CPU→CUDA async copies; Vulkan/SYCL/OpenVINO kernel + op-table updates (incl. `GGML_GLU_OP_SWIGLU_OAI`, softmax attention-sinks). No API surface visible to `jllama.cpp`; the OpenCL set only affects the `opencl-android-aarch64` classifier, CUDA only `cuda13-linux-x86-64`. No project source changes required | +| b9803–b9829 | `common/common.{h,cpp}` + `common/speculative.cpp` + `common/arg.{cpp,h}` + `tools/mtmd/clip*.{h,cpp}` | Internal upstream churn: new `COM_*`/`SPC_*` logging macros (the `LOG_*` calls inside `common.cpp`/`speculative.cpp`/`reasoning-budget.cpp` were rewrapped, several `LOG_INF`→`LOG_TRC` quieting); `common_models_handler` gained `plan_spec`/`plan_voc` for `--spec-draft-hf`/`--hf-repo-v` downloads + duplicate-task dedup; `clip` hardened GGUF array reads (`get_arr_f32`, even-pinpoints / mean-std validation, `n_merge` defaults to 1). All consumed inside upstream-compiled `common`/`mtmd`; `grep -rn "common_models_handler\|COM_TRC\|n_merge" src/main/cpp src/test/cpp` → zero matches. No project source changes required | +| b9803–b9829 | upstream verification (sandbox) | All four patches (`0001`–`0004`) re-verified to **apply + reverse-apply cleanly** against b9829 via `git apply --check` / `git apply --reverse --check` over the actual b9829 sources fetched from `api.github.com` (github.com git-clone — incl. `FetchContent` of `nlohmann/json` and llama.cpp — is blocked in this sandbox, so a full build could not run). Patch 0001 was refreshed for the `test-export-graph-ops` rename and the `server.cpp` GC-insertion context shift (see the row above); 0002/0003/0004 unchanged. The **`server-stream.cpp` link fix** in `CMakeLists.txt` is required by the b9829 server-TU `#include`s (verified against the upstream diff: `server-context`/`server-http`/`server-models` reference symbols defined only in `server-stream.cpp`). Full build + `ctest` (target 454/454) to be confirmed by the CI pipeline. | diff --git a/patches/0001-win32-arg-parse-embed-guard.patch b/patches/0001-win32-arg-parse-embed-guard.patch index eb590a50..916a1a3b 100644 --- a/patches/0001-win32-arg-parse-embed-guard.patch +++ b/patches/0001-win32-arg-parse-embed-guard.patch @@ -267,10 +267,10 @@ index 0a75ac1..bdf13ba 100644 return 1; } -diff --git a/tests/export-graph-ops.cpp b/tests/export-graph-ops.cpp -index 64cf6dc..ca382e4 100644 ---- a/tests/export-graph-ops.cpp -+++ b/tests/export-graph-ops.cpp +diff --git a/tests/test-export-graph-ops.cpp b/tests/test-export-graph-ops.cpp +index 7d8118d..88b7641 100644 +--- a/tests/test-export-graph-ops.cpp ++++ b/tests/test-export-graph-ops.cpp @@ -131,7 +131,7 @@ int main(int argc, char ** argv) { common_init(); @@ -496,13 +496,13 @@ index f2179ed..6d958a8 100644 } if (params.out_file.empty()) { diff --git a/tools/server/server.cpp b/tools/server/server.cpp -index 4165c10..7a7ad2f 100644 +index eafef86..84c7f0b 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp -@@ -82,7 +82,7 @@ int llama_server(int argc, char ** argv) { - - common_init(); - +@@ -87,7 +87,7 @@ int llama_server(int argc, char ** argv) { + // touch it. lifecycle is symmetric, stop_gc() runs in clean_up() before backend free + g_stream_sessions.start_gc(); + - if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_SERVER)) { + if (!common_params_parse_main(argc, argv, params, LLAMA_EXAMPLE_SERVER)) { return 1; diff --git a/patches/0003-pr22393-server-add-slot-prompt-similarity-getter-setter.patch b/patches/0003-pr22393-server-add-slot-prompt-similarity-getter-setter.patch new file mode 100644 index 00000000..d52ebdc4 --- /dev/null +++ b/patches/0003-pr22393-server-add-slot-prompt-similarity-getter-setter.patch @@ -0,0 +1,45 @@ +Upstream PR: ggml-org/llama.cpp#22393 — "server : add slot_prompt_similarity getter/setter" +https://github.com/ggml-org/llama.cpp/pull/22393 + +Carried locally until the PR is merged upstream. Adds public get/set accessors for the +server_context `slot_prompt_similarity` field so an embedding/JNI caller can query and tune +the slot-selection threshold at runtime without reloading the model. The change is purely +additive (two new accessors + their declarations) and is a verbatim copy of the upstream PR, +so it can be dropped from patches/ once b includes it. Refresh against the new source on +every llama.cpp version bump (the applier fails loud if the context shifts). + +diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp +index 39b7eb2..7c274cb 100644 +--- a/tools/server/server-context.cpp ++++ b/tools/server/server-context.cpp +@@ -3965,6 +3965,14 @@ server_response_reader server_context::get_response_reader() { + return impl->get_response_reader(); + } + ++float server_context::get_slot_prompt_similarity() const { ++ return impl->slot_prompt_similarity; ++} ++ ++void server_context::set_slot_prompt_similarity(float value) { ++ impl->slot_prompt_similarity = value; ++} ++ + server_context_meta server_context::get_meta() const { + auto bos_id = llama_vocab_bos(impl->vocab); + auto eos_id = llama_vocab_eos(impl->vocab); +diff --git a/tools/server/server-context.h b/tools/server/server-context.h +index 952f825..938c985 100644 +--- a/tools/server/server-context.h ++++ b/tools/server/server-context.h +@@ -106,6 +106,11 @@ struct server_context { + // not thread-safe, should only be used from the main thread + server_context_meta get_meta() const; + ++ // get/set the slot-prompt-similarity threshold for slot selection ++ // not thread-safe, should only be used from the main thread ++ float get_slot_prompt_similarity() const; ++ void set_slot_prompt_similarity(float value); ++ + // note: must be set before load_model() is called + void set_state_callback(server_state_callback_t callback); + }; diff --git a/patches/0004-pr23116-server-per-request-reasoning-budget-tokens.patch b/patches/0004-pr23116-server-per-request-reasoning-budget-tokens.patch new file mode 100644 index 00000000..0ac4f3b1 --- /dev/null +++ b/patches/0004-pr23116-server-per-request-reasoning-budget-tokens.patch @@ -0,0 +1,131 @@ +Upstream PR: ggml-org/llama.cpp#23116 — "server: honour per-request reasoning_budget_tokens in +chat completions" +https://github.com/ggml-org/llama.cpp/pull/23116 + +Carried locally until the PR is merged upstream. Motivated by java-llama.cpp#140: a per-request +`reasoning_budget_tokens` (and `reasoning_budget_message`) sent on a chat-completions request must +override the server-launch default. Upstream `oaicompat_chat_params_parse` only read the Anthropic +`thinking_budget_tokens` alias and always wrote the server-level `reasoning_budget_message`, so the +canonical per-request keys were ignored. The patch reads both overrides before the generic copy loop +(precedence: reasoning_budget_tokens > thinking_budget_tokens alias > server default) and threads the +per-request message through. Includes the upstream test additions (tests/test-chat.cpp) verbatim so +the patch is submittable as-is; LLAMA_BUILD_TESTS is OFF for the FetchContent subproject, so those are +applied-but-not-compiled here. Refresh against the new source on every llama.cpp version bump (the +applier fails loud if the context shifts). + +diff --git a/tests/test-chat.cpp b/tests/test-chat.cpp +index c38aed8..dfa8006 100644 +--- a/tests/test-chat.cpp ++++ b/tests/test-chat.cpp +@@ -5780,6 +5780,71 @@ static void test_developer_role_to_system_workaround() { + } + } + ++static void test_reasoning_budget_tokens_per_request() { ++ LOG_DBG("%s\n", __func__); ++ // Use Qwen3 template which has ... reasoning markers. ++ // The autoparser detects them and sets thinking_start/end_tag, which enables ++ // the reasoning-budget code path in oaicompat_chat_params_parse. ++ auto tmpls = read_templates("models/templates/Qwen-Qwen3-0.6B.jinja"); ++ ++ server_chat_params opt; ++ opt.tmpls = std::move(tmpls); ++ opt.use_jinja = true; ++ opt.enable_thinking = true; ++ opt.reasoning_budget = -1; ++ opt.reasoning_format = COMMON_REASONING_FORMAT_NONE; ++ ++ // Body with per-request reasoning_budget_tokens=0 (suppress thinking). ++ json body = { ++ {"messages", json::array({json{{"role", "user"}, {"content", "hello"}}})}, ++ {"reasoning_budget_tokens", 0}, ++ }; ++ std::vector out_files; ++ auto llama_params = oaicompat_chat_params_parse(body, opt, out_files); ++ ++ // The per-request value must win over the server default (-1). ++ if (!llama_params.contains("reasoning_budget_tokens")) { ++ throw std::runtime_error("reasoning_budget_tokens missing from llama_params (thinking_end_tag may be empty for this template)"); ++ } ++ int got = llama_params["reasoning_budget_tokens"].get(); ++ if (got != 0) { ++ throw std::runtime_error(std::string("Expected reasoning_budget_tokens=0, got ") + std::to_string(got)); ++ } ++} ++ ++static void test_reasoning_budget_message_per_request() { ++ LOG_DBG("%s\n", __func__); ++ // Same code path as test_reasoning_budget_tokens_per_request: the Qwen3 template's ++ // ... markers enable the reasoning-budget block in oaicompat_chat_params_parse. ++ auto tmpls = read_templates("models/templates/Qwen-Qwen3-0.6B.jinja"); ++ ++ server_chat_params opt; ++ opt.tmpls = std::move(tmpls); ++ opt.use_jinja = true; ++ opt.enable_thinking = true; ++ opt.reasoning_budget = -1; ++ opt.reasoning_format = COMMON_REASONING_FORMAT_NONE; ++ opt.reasoning_budget_message = "server default"; ++ ++ // Body with a per-request reasoning_budget_message override. ++ const std::string per_request_message = "per-request message"; ++ json body = { ++ {"messages", json::array({json{{"role", "user"}, {"content", "hello"}}})}, ++ {"reasoning_budget_message", per_request_message}, ++ }; ++ std::vector out_files; ++ auto llama_params = oaicompat_chat_params_parse(body, opt, out_files); ++ ++ // The per-request value must win over the server default. ++ if (!llama_params.contains("reasoning_budget_message")) { ++ throw std::runtime_error("reasoning_budget_message missing from llama_params (thinking_end_tag may be empty for this template)"); ++ } ++ std::string got = llama_params["reasoning_budget_message"].get(); ++ if (got != per_request_message) { ++ throw std::runtime_error("Expected reasoning_budget_message='" + per_request_message + "', got '" + got + "'"); ++ } ++} ++ + static void test_msg_diffs_compute() { + LOG_DBG("%s\n", __func__); + { +@@ -5937,6 +6002,8 @@ int main(int argc, char ** argv) { + test_convert_responses_to_chatcmpl(); + test_developer_role_to_system_workaround(); + test_template_generation_prompt(); ++ test_reasoning_budget_tokens_per_request(); ++ test_reasoning_budget_message_per_request(); + test_template_output_peg_parsers(detailed_debug); + std::cout << "\n[chat] All tests passed!" << '\n'; + } +diff --git a/tools/server/server-common.cpp b/tools/server/server-common.cpp +index ac291d3..26cdfd2 100644 +--- a/tools/server/server-common.cpp ++++ b/tools/server/server-common.cpp +@@ -1116,16 +1116,24 @@ json oaicompat_chat_params_parse( + + // Reasoning budget: pass parameters through to sampling layer + { +- int reasoning_budget = json_value(body, "thinking_budget_tokens", -1); ++ // Per-request overrides, read before writing to llama_params so the generic copy ++ // loop (which skips keys already present) won't clobber the caller-supplied values. ++ // Precedence: canonical reasoning_budget_tokens > Anthropic thinking_budget_tokens ++ // alias > server-level default. ++ int reasoning_budget = json_value(body, "reasoning_budget_tokens", -1); ++ if (reasoning_budget == -1) { ++ reasoning_budget = json_value(body, "thinking_budget_tokens", -1); ++ } + if (reasoning_budget == -1) { + reasoning_budget = opt.reasoning_budget; + } ++ std::string reasoning_budget_message = json_value(body, "reasoning_budget_message", opt.reasoning_budget_message); + + if (!chat_params.thinking_end_tag.empty()) { + llama_params["reasoning_budget_tokens"] = reasoning_budget; + llama_params["reasoning_budget_start_tag"] = chat_params.thinking_start_tag; + llama_params["reasoning_budget_end_tag"] = chat_params.thinking_end_tag; +- llama_params["reasoning_budget_message"] = opt.reasoning_budget_message; ++ llama_params["reasoning_budget_message"] = reasoning_budget_message; + llama_params["reasoning_control"] = json_value(body, "reasoning_control", false); + } + } diff --git a/src/main/cpp/jllama.cpp b/src/main/cpp/jllama.cpp index e94bb863..85b4dda1 100644 --- a/src/main/cpp/jllama.cpp +++ b/src/main/cpp/jllama.cpp @@ -1437,15 +1437,14 @@ JNIEXPORT jboolean JNICALL Java_net_ladenthin_llama_LlamaModel_configureParallel // slot_prompt_similarity: validated above (the [0.0, 1.0] range check still // throws for out-of-range values, preserving the existing exception - // contract). Live mutation requires an upstream setter that does not yet - // exist at b8913 — upstream PR: https://github.com/ggml-org/llama.cpp/pull/22393 - // adds server_context::set_slot_prompt_similarity(). Once that lands and - // the pinned llama.cpp version is bumped, uncomment the block below: - // - // if (slot_sim_opt.has_value()) { - // ctx_server->set_slot_prompt_similarity(*slot_sim_opt); - // } - (void)slot_sim_opt; + // contract). Live mutation uses server_context::set_slot_prompt_similarity(), + // added upstream by https://github.com/ggml-org/llama.cpp/pull/22393 and carried + // in this repo as patches/0003-pr22393-... until it merges upstream (the pinned + // llama.cpp is now b9829, which the patch applies against). not thread-safe per + // the upstream contract — main-thread only, which this JNI call is. + if (slot_sim_opt.has_value()) { + ctx_server->set_slot_prompt_similarity(*slot_sim_opt); + } return JNI_TRUE; } diff --git a/src/main/java/net/ladenthin/llama/parameters/InferenceParameters.java b/src/main/java/net/ladenthin/llama/parameters/InferenceParameters.java index 647d9425..a47ee190 100644 --- a/src/main/java/net/ladenthin/llama/parameters/InferenceParameters.java +++ b/src/main/java/net/ladenthin/llama/parameters/InferenceParameters.java @@ -103,6 +103,11 @@ public final class InferenceParameters extends JsonParameters { private static final String PARAM_TOOLS = "tools"; private static final String PARAM_TOOL_CHOICE = "tool_choice"; private static final String PARAM_PARALLEL_TOOL_CALLS = "parallel_tool_calls"; + private static final String PARAM_DRY_MULTIPLIER = "dry_multiplier"; + private static final String PARAM_DRY_BASE = "dry_base"; + private static final String PARAM_DRY_ALLOWED_LENGTH = "dry_allowed_length"; + private static final String PARAM_DRY_PENALTY_LAST_N = "dry_penalty_last_n"; + private static final String PARAM_DRY_SEQUENCE_BREAKERS = "dry_sequence_breakers"; private static final InferenceParameters EMPTY = new InferenceParameters(); @@ -734,6 +739,83 @@ public InferenceParameters withTopNSigma(float topNSigma) { return withScalar(PARAM_TOP_N_SIGMA, topNSigma); } + /** + * Returns a new request with the per-request DRY (Don't Repeat Yourself) repetition multiplier + * replaced (default: 0.0, 0.0 = DRY disabled). DRY suppresses repeated multi-token sequences + * without the collateral damage of the classic {@code repeat_penalty}. This is the per-request + * mirror of {@link ModelParameters#setDryMultiplier(float)} (the {@code --dry-multiplier} launch + * flag); when this wither is not called, nothing is emitted and DRY stays disabled. + * + * @param dryMultiplier the DRY sampling multiplier (0.0 = disabled) + * @return a new instance; this instance is unchanged + */ + public InferenceParameters withDryMultiplier(float dryMultiplier) { + return withScalar(PARAM_DRY_MULTIPLIER, dryMultiplier); + } + + /** + * Returns a new request with the per-request DRY base replaced (default: 1.75). The base is the + * exponential growth factor applied to the penalty as a repeated sequence lengthens; it only takes + * effect when {@link #withDryMultiplier(float)} is non-zero. Per-request mirror of + * {@link ModelParameters#setDryBase(float)} (the {@code --dry-base} launch flag). + * + * @param dryBase the DRY sampling base value + * @return a new instance; this instance is unchanged + */ + public InferenceParameters withDryBase(float dryBase) { + return withScalar(PARAM_DRY_BASE, dryBase); + } + + /** + * Returns a new request with the per-request DRY allowed length replaced (default: 2). Sequences + * up to this length are not penalized; the penalty applies only once a repeated sequence grows + * longer. Only takes effect when {@link #withDryMultiplier(float)} is non-zero. Per-request mirror + * of {@link ModelParameters#setDryAllowedLength(int)} (the {@code --dry-allowed-length} launch flag). + * + * @param dryAllowedLength the allowed length for DRY sampling + * @return a new instance; this instance is unchanged + */ + public InferenceParameters withDryAllowedLength(int dryAllowedLength) { + return withScalar(PARAM_DRY_ALLOWED_LENGTH, dryAllowedLength); + } + + /** + * Returns a new request with the per-request DRY penalty window replaced (default: -1, -1 = context + * size, 0 = disabled). Only takes effect when {@link #withDryMultiplier(float)} is non-zero. + * Per-request mirror of {@link ModelParameters#setDryPenaltyLastN(int)} (the + * {@code --dry-penalty-last-n} launch flag); values below {@code -1} are rejected. + * + * @param dryPenaltyLastN the DRY penalty window (-1 = context size, 0 = disabled) + * @return a new instance; this instance is unchanged + * @throws IllegalArgumentException if {@code dryPenaltyLastN} is less than {@code -1} + */ + public InferenceParameters withDryPenaltyLastN(int dryPenaltyLastN) { + if (dryPenaltyLastN < -1) { + throw new IllegalArgumentException("Invalid dry_penalty_last_n value: " + dryPenaltyLastN + + " (must be >= -1; -1 = context size, 0 = disabled)"); + } + return withScalar(PARAM_DRY_PENALTY_LAST_N, dryPenaltyLastN); + } + + /** + * Returns a new request with the per-request DRY sequence breakers replaced. Sequence breakers are + * tokens at which DRY restarts matching, so repetition is not penalized across them (llama.cpp + * default: {@code ["\n", ":", "\"", "*"]}). Empty input is a no-op (returns {@code this}), so when + * this wither is not called nothing is emitted and the server's default breakers apply. Only takes + * effect when {@link #withDryMultiplier(float)} is non-zero. + * + * @param breakers the sequence-breaker strings + * @return a new instance with the breaker array set, or {@code this} if {@code breakers} is empty + */ + public InferenceParameters withDrySequenceBreakers(String... breakers) { + if (breakers.length == 0) { + return this; + } + return withRaw( + PARAM_DRY_SEQUENCE_BREAKERS, + serializer.buildStopStrings(breakers).toString()); + } + /** * Returns a new request with the reasoning-format choice replaced. * diff --git a/src/test/cpp/test_server.cpp b/src/test/cpp/test_server.cpp index a85dba4c..546b618b 100644 --- a/src/test/cpp/test_server.cpp +++ b/src/test/cpp/test_server.cpp @@ -1760,6 +1760,44 @@ TEST(ParamsFromJsonCmpl, EmptyDrySequenceBreakers_Throws) { EXPECT_THROW(parse_params({{"dry_sequence_breakers", json::array()}}), std::invalid_argument); } +// Happy-path DRY field parsing. Pins the contract that the JSON keys emitted by +// InferenceParameters.withDryMultiplier / withDryBase / withDryAllowedLength / +// withDryPenaltyLastN / withDrySequenceBreakers are exactly the keys +// server-schema.cpp reads into common_params_sampling. An upstream field rename +// would break the per-request DRY feature silently; these catch it at the C++ +// unit-test layer (no model / vocab required — DRY parsing is vocab-independent). +TEST(ParamsFromJsonCmpl, DryMultiplier_RoundTrip) { + const auto p = parse_params({{"dry_multiplier", 0.8f}}); + EXPECT_FLOAT_EQ(p.sampling.dry_multiplier, 0.8f); +} + +TEST(ParamsFromJsonCmpl, DryBase_AtOrAboveOne_RoundTrip) { + // 2.5 != the 1.75 default, so this proves the supplied value is stored (not defaulted) + const auto p = parse_params({{"dry_base", 2.5f}}); + EXPECT_FLOAT_EQ(p.sampling.dry_base, 2.5f); +} + +TEST(ParamsFromJsonCmpl, DryAllowedLength_RoundTrip) { + const auto p = parse_params({{"dry_allowed_length", 3}}); + EXPECT_EQ(p.sampling.dry_allowed_length, 3); +} + +TEST(ParamsFromJsonCmpl, DryPenaltyLastN_Positive_RoundTrip) { + // a positive value is kept verbatim (only -1 expands to n_ctx_slot, covered above) + const auto p = parse_params({{"dry_penalty_last_n", 64}}); + EXPECT_EQ(p.sampling.dry_penalty_last_n, 64); +} + +TEST(ParamsFromJsonCmpl, DrySequenceBreakers_NonEmpty_RoundTrip) { + // mirrors the llama.cpp default list that withDrySequenceBreakers forwards verbatim + const auto p = parse_params({{"dry_sequence_breakers", {"\n", ":", "\"", "*"}}}); + ASSERT_EQ(p.sampling.dry_sequence_breakers.size(), 4u); + EXPECT_EQ(p.sampling.dry_sequence_breakers[0], "\n"); + EXPECT_EQ(p.sampling.dry_sequence_breakers[1], ":"); + EXPECT_EQ(p.sampling.dry_sequence_breakers[2], "\""); + EXPECT_EQ(p.sampling.dry_sequence_breakers[3], "*"); +} + TEST(ParamsFromJsonCmpl, LoraNotArray_Throws) { EXPECT_THROW(parse_params({{"lora", "not-an-array"}}), std::invalid_argument); } diff --git a/src/test/java/net/ladenthin/llama/LlamaModelTest.java b/src/test/java/net/ladenthin/llama/LlamaModelTest.java index 77064082..d78c1eb8 100644 --- a/src/test/java/net/ladenthin/llama/LlamaModelTest.java +++ b/src/test/java/net/ladenthin/llama/LlamaModelTest.java @@ -122,6 +122,49 @@ public void testGenerateInfill() { assertTrue(generated > 0 && generated <= nPredict + 1); } + /** + * Per-request DRY sampling must actually reach the native sampler and alter generation. + * + *

With greedy decoding ({@code withTopK(1)}) and a fixed seed, two completions of the same + * prompt are byte-identical unless something changes the sampler. The prompt is saturated with a + * repeated multi-token n-gram, so enabling DRY with a strong multiplier and a short allowed length + * ({@code dry_penalty_last_n = -1} scans the whole context) penalizes the next token that would + * extend that n-gram — forcing the DRY run to diverge from the baseline. This exercises the + * full Java → JSON → native path for {@code withDryMultiplier} / {@code withDryBase} / + * {@code withDryAllowedLength} / {@code withDryPenaltyLastN} end to end; the per-field JSON + * round-trip is pinned deterministically by the C++ {@code ParamsFromJsonCmpl.Dry*} tests. + */ + @Test + public void testDrySamplingAltersRepetitiveGeneration() { + final String repetitivePrompt = "The cat sat. The cat sat. The cat sat. The cat sat. "; + + InferenceParameters baseline = new InferenceParameters(repetitivePrompt) + .withNPredict(24) + .withTopK(1) // greedy → deterministic given the seed + .withSeed(42) + .withDryMultiplier(0.0f); // DRY disabled (llama.cpp default) + + InferenceParameters withDry = new InferenceParameters(repetitivePrompt) + .withNPredict(24) + .withTopK(1) + .withSeed(42) + .withDryMultiplier(4.0f) + .withDryBase(1.75f) + .withDryAllowedLength(2) + .withDryPenaltyLastN(-1); + + String baselineOutput = model.complete(baseline); + String dryOutput = model.complete(withDry); + + assertNotNull(baselineOutput); + assertNotNull(dryOutput); + assertNotEquals( + baselineOutput, + dryOutput, + "DRY sampling with a strong multiplier must change greedy generation on a repetitive prompt; " + + "identical output means the dry_* fields never reached the sampler"); + } + @Test public void testGenerateGrammar() { InferenceParameters params = new InferenceParameters("") diff --git a/src/test/java/net/ladenthin/llama/MultimodalIntegrationTest.java b/src/test/java/net/ladenthin/llama/MultimodalIntegrationTest.java index 886ee70d..dc50033b 100644 --- a/src/test/java/net/ladenthin/llama/MultimodalIntegrationTest.java +++ b/src/test/java/net/ladenthin/llama/MultimodalIntegrationTest.java @@ -59,13 +59,18 @@ * author. Any image the test machine can reach works at runtime — the * URL is just an env var.

* - *

Implements the upstream vision feature requests kherud/java-llama.cpp#103 and #34.

+ *

Implements the vision feature originally requested in the pre-fork upstream repository: + * https://github.com/kherud/java-llama.cpp/issues/103 + * and + * https://github.com/kherud/java-llama.cpp/issues/34.

*/ @ClaudeGenerated( purpose = "End-to-end vision regression: real vision GGUF + mmproj + author-licensed (MIT) " + "test image fed through the typed ChatMessage(role, List) API; " + "asserts non-empty reply to prove the OAI multipart content round-trips through " - + "the upstream mtmd pipeline. Implements upstream kherud/java-llama.cpp#103 / #34.") + + "the upstream mtmd pipeline. Implements the pre-fork upstream vision requests " + + "https://github.com/kherud/java-llama.cpp/issues/103 and " + + "https://github.com/kherud/java-llama.cpp/issues/34.") public class MultimodalIntegrationTest { private static final ObjectMapper MAPPER = new ObjectMapper(); diff --git a/src/test/java/net/ladenthin/llama/ReasoningBudgetTest.java b/src/test/java/net/ladenthin/llama/ReasoningBudgetTest.java index c1be993d..1e216c36 100644 --- a/src/test/java/net/ladenthin/llama/ReasoningBudgetTest.java +++ b/src/test/java/net/ladenthin/llama/ReasoningBudgetTest.java @@ -17,7 +17,6 @@ import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.Assumptions; import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; /** @@ -36,25 +35,22 @@ * {@code --reasoning-format deepseek} at model load time causes the server to * strip the {@code } block from the response body and surface it * in {@code reasoning_content}. - *
  • {@code reasoning_budget_tokens} is NOT enforced for any model when set - * per-request. The root cause is a bug in - * {@code tools/server/server-common.cpp}, function - * {@code oaicompat_chat_params_parse}: the reasoning-budget block writes - * the model-level default ({@code opt.reasoning_budget}, typically −1) - * into {@code llama_params["reasoning_budget_tokens"]} before the generic - * copy loop runs. The copy loop then skips the per-request value from the - * request body because the key already exists - * ({@code !llama_params.contains(item.key())} is false). Result: the - * reasoning-budget sampler is never created (it requires - * {@code reasoning_budget_tokens ≥ 0}), and any per-request budget - * has no effect. Parameter serialisation itself is correct — see - * {@code InferenceParametersTest} and the C++ unit tests.
  • + *
  • {@code reasoning_budget_tokens} IS enforced per-request. This was originally + * broken in {@code tools/server/server-common.cpp} ({@code oaicompat_chat_params_parse}): + * the reasoning-budget block wrote the model-level default into + * {@code llama_params["reasoning_budget_tokens"]} before the generic copy loop, which then + * skipped the per-request value because the key already existed, so the reasoning-budget + * sampler was never created. It is fixed by upstream PR #23116, carried here as + * {@code patches/0004-pr23116-server-per-request-reasoning-budget-tokens.patch} (drop the + * patch once a pinned {@code b} includes it). With the fix, + * {@code reasoning_budget_tokens=0} suppresses thinking. Parameter serialisation is covered + * by {@code InferenceParametersTest} and the C++ unit tests.
  • * */ @ClaudeGenerated( - purpose = "Integration tests for Qwen3 thinking-mode extraction and reasoning_budget_tokens " - + "parameter acceptance. Documents the known llama.cpp limitation that budget " - + "enforcement does not work for prompt-injected thinking models.") + purpose = "Integration tests for Qwen3 thinking-mode extraction and per-request " + + "reasoning_budget_tokens enforcement (fixed via patches/0004, upstream PR #23116): " + + "budget=0 suppresses thinking.") public class ReasoningBudgetTest { /** @@ -123,95 +119,37 @@ public void testThinkingDefault_reasoningContentAndAnswerPresent() { } /** - * {@code reasoning_budget_tokens=0} is accepted by the API and the response - * completes without error, but the budget is NOT enforced. + * Per-request {@code reasoning_budget_tokens=0} suppresses thinking: the model emits an + * empty {@code reasoning_content}. * - *

    Documents current (broken) behaviour. The per-request value is - * silently discarded by a bug in {@code tools/server/server-common.cpp} - * ({@code oaicompat_chat_params_parse}): the reasoning-budget block writes the - * model-level default (−1) to {@code llama_params["reasoning_budget_tokens"]} - * before the generic copy loop runs, and the copy loop then skips the user value - * because the key already exists. The reasoning-budget sampler is therefore never - * created, and {@code reasoning_content} remains non-empty. + *

    The per-request budget is honored by upstream + * llama.cpp PR #23116, carried + * in this repo as {@code patches/0004-pr23116-server-per-request-reasoning-budget-tokens.patch} + * until a pinned {@code b} includes it. Before that fix, + * {@code oaicompat_chat_params_parse} ({@code tools/server/server-common.cpp}) wrote the + * model-level default into {@code llama_params["reasoning_budget_tokens"]} before the generic + * copy loop, so the per-request value was dropped and the reasoning-budget sampler was never + * created. With the fix, {@code budget=0} forces the end-of-thinking sequence immediately. * - *

    This assertion will start failing once the llama.cpp bug is fixed — - * that is the signal to remove this test and enable - * {@link #testReasoningBudgetZero_expectedBehavior_suppressesThinking}. - * Tracked in llama.cpp PR #23116. - * - *

    {@code temperature=0} (greedy sampling) is used so the model deterministically - * enters the {@code } block on every platform. Without it, Metal (macOS arm64) - * occasionally samples a non-thinking first token even when the budget is unlimited - * (due to the bug), causing a spurious test failure. + *

    {@code temperature=0} (greedy) keeps the first-token choice deterministic across + * platforms (notably macOS Metal), so the result does not depend on sampling. Parameter + * serialisation is covered separately by {@code InferenceParametersTest} and the C++ unit tests. */ @Test - public void testReasoningBudgetZero_parameterAccepted_thinkingNotSuppressed() { + public void testReasoningBudgetZero_suppressesThinking() { InferenceParameters params = new InferenceParameters("") .withMessages(null, Collections.singletonList(new Pair<>("user", "What is 2+2?"))) .withTemperature(0.0f) .withReasoningBudgetTokens(0) .withNPredict(N_PREDICT); - String json = model.chatComplete(params); - - assertNotNull(json, "Response JSON must not be null"); - - String reasoningContent = parser.extractChoiceReasoningContent(json); - assertFalse( - reasoningContent == null || reasoningContent.trim().isEmpty(), - "reasoning_content is expected to be present because the per-request " - + "budget is not applied (llama.cpp server-common.cpp copy-loop bug). " - + "If this assertion fails, the bug has been fixed — remove this test and " - + "enable testReasoningBudgetZero_expectedBehavior_suppressesThinking."); - } - - /** - * Expected correct behaviour after the llama.cpp bug is fixed. - * - *

    Bug: In {@code tools/server/server-common.cpp}, - * {@code oaicompat_chat_params_parse} sets - * {@code llama_params["reasoning_budget_tokens"]} to the model-level default - * ({@code opt.reasoning_budget}, typically −1) before the generic copy - * loop runs. The copy loop then skips the per-request value from the request - * body because the key already exists. Result: the sampler is never created - * ({@code reasoning_budget_tokens ≥ 0} is required), and budget=0 - * has no effect. - * - *

    Fix (server-common.cpp, reasoning budget block): - * Read {@code reasoning_budget_tokens} from the request body before - * writing to {@code llama_params}: - *

    -     * int reasoning_budget = opt.reasoning_budget;
    -     * if (body.contains("reasoning_budget_tokens")) {
    -     *     reasoning_budget = json_value(body, "reasoning_budget_tokens", reasoning_budget);
    -     * }
    -     * if (reasoning_budget == -1 && body.contains("thinking_budget_tokens")) {
    -     *     reasoning_budget = json_value(body, "thinking_budget_tokens", -1);
    -     * }
    -     * 
    - * - *

    Once this fix is applied: remove {@code @Ignore}, confirm this test passes, - * and remove - * {@link #testReasoningBudgetZero_parameterAccepted_thinkingNotSuppressed}. - * Tracked in llama.cpp PR #23116. - */ - @Disabled("llama.cpp bug: per-request reasoning_budget_tokens is overwritten by model default " - + "in oaicompat_chat_params_parse (server-common.cpp). " - + "See Javadoc for exact fix location and code.") - @Test - public void testReasoningBudgetZero_expectedBehavior_suppressesThinking() { - InferenceParameters params = new InferenceParameters("") - .withMessages(null, Collections.singletonList(new Pair<>("user", "What is 2+2?"))) - .withReasoningBudgetTokens(0) - .withNPredict(N_PREDICT); - String json = model.chatComplete(params); assertNotNull(json, "Response JSON must not be null"); String reasoningContent = parser.extractChoiceReasoningContent(json); assertTrue( reasoningContent == null || reasoningContent.trim().isEmpty(), - "reasoning_content should be empty when budget=0 suppresses thinking, " + "but was: " + "reasoning_content must be empty when reasoning_budget_tokens=0 suppresses thinking, " + "but was: " + reasoningContent); } @@ -224,8 +162,9 @@ public void testReasoningBudgetZero_expectedBehavior_suppressesThinking() { * model may exhaust the token budget inside the thinking block and emit an empty * {@code content}; checking both fields makes the test robust to that behaviour. * - *

    See {@link #testReasoningBudgetZero_parameterAccepted_thinkingNotSuppressed} for - * the note on why the budget count itself is not asserted. + *

    The exact number of thinking tokens consumed is not asserted — it is hardware- and + * sampling-dependent; {@link #testReasoningBudgetZero_suppressesThinking} covers the + * deterministic {@code budget=0} suppression case. */ @Test public void testReasoningBudgetPositive_parameterAccepted() { diff --git a/src/test/java/net/ladenthin/llama/TestConstants.java b/src/test/java/net/ladenthin/llama/TestConstants.java index 2f5809ee..883d91c4 100644 --- a/src/test/java/net/ladenthin/llama/TestConstants.java +++ b/src/test/java/net/ladenthin/llama/TestConstants.java @@ -46,7 +46,7 @@ public class TestConstants { /** * System property holding a path to a vision-capable model GGUF. Consumed by - * {@code MultimodalIntegrationTest} (upstream kherud/java-llama.cpp#103 / #34). The CI default is the + * {@code MultimodalIntegrationTest}. The CI default is the * SmolVLM-500M Q8_0 GGUF; the test self-skips when the property is unset or * the file is missing. */ diff --git a/src/test/java/net/ladenthin/llama/parameters/InferenceParametersTest.java b/src/test/java/net/ladenthin/llama/parameters/InferenceParametersTest.java index 01c2b94c..0faa2626 100644 --- a/src/test/java/net/ladenthin/llama/parameters/InferenceParametersTest.java +++ b/src/test/java/net/ladenthin/llama/parameters/InferenceParametersTest.java @@ -727,4 +727,84 @@ public void testSetTokenIdBiasMultiple() { assertThat(value, containsString("1")); assertThat(value, containsString("2")); } + + // ------------------------------------------------------------------------- + // DRY (Don't Repeat Yourself) sampling + // ------------------------------------------------------------------------- + + @Test + public void testSetDryMultiplier() { + InferenceParameters params = new InferenceParameters("").withDryMultiplier(0.8f); + assertThat(params.parameters.get("dry_multiplier"), is("0.8")); + } + + @Test + public void testSetDryBase() { + InferenceParameters params = new InferenceParameters("").withDryBase(1.75f); + assertThat(params.parameters.get("dry_base"), is("1.75")); + } + + @Test + public void testSetDryAllowedLength() { + InferenceParameters params = new InferenceParameters("").withDryAllowedLength(2); + assertThat(params.parameters.get("dry_allowed_length"), is("2")); + } + + @Test + public void testSetDryPenaltyLastN() { + InferenceParameters params = new InferenceParameters("").withDryPenaltyLastN(-1); + assertThat(params.parameters.get("dry_penalty_last_n"), is("-1")); + } + + @Test + public void testSetDryPenaltyLastNDisabled() { + InferenceParameters params = new InferenceParameters("").withDryPenaltyLastN(0); + assertThat(params.parameters.get("dry_penalty_last_n"), is("0")); + } + + @Test + public void testSetDryPenaltyLastNBelowMinusOneRejected() { + InferenceParameters params = new InferenceParameters(""); + assertThrows(IllegalArgumentException.class, () -> params.withDryPenaltyLastN(-2)); + } + + @Test + public void testSetDrySequenceBreakersSingle() { + InferenceParameters params = new InferenceParameters("").withDrySequenceBreakers("\n"); + assertThat(params.parameters.get("dry_sequence_breakers"), is("[\"\\n\"]")); + } + + @Test + public void testSetDrySequenceBreakersMultiple() { + InferenceParameters params = new InferenceParameters("").withDrySequenceBreakers("\n", ":", "\"", "*"); + assertThat(params.parameters.get("dry_sequence_breakers"), is("[\"\\n\",\":\",\"\\\"\",\"*\"]")); + } + + @Test + public void testSetDrySequenceBreakersEmpty() { + InferenceParameters params = new InferenceParameters(""); + InferenceParameters result = params.withDrySequenceBreakers(); + assertThat(params.parameters, not(hasKey("dry_sequence_breakers"))); + assertThat(result, is(sameInstance(params))); + } + + @Test + public void testDryDefaultsEmitNothing() { + InferenceParameters params = new InferenceParameters("prompt"); + assertThat(params.parameters, not(hasKey("dry_multiplier"))); + assertThat(params.parameters, not(hasKey("dry_base"))); + assertThat(params.parameters, not(hasKey("dry_allowed_length"))); + assertThat(params.parameters, not(hasKey("dry_penalty_last_n"))); + assertThat(params.parameters, not(hasKey("dry_sequence_breakers"))); + } + + @Test + public void testDryWithersReturnNewInstance() { + InferenceParameters params = new InferenceParameters(""); + assertThat(params.withDryMultiplier(0.8f), is(not(sameInstance(params)))); + assertThat(params.withDryBase(1.75f), is(not(sameInstance(params)))); + assertThat(params.withDryAllowedLength(2), is(not(sameInstance(params)))); + assertThat(params.withDryPenaltyLastN(-1), is(not(sameInstance(params)))); + assertThat(params.withDrySequenceBreakers("\n"), is(not(sameInstance(params)))); + } }