bernardladenthin · bernardladenthin · Jun 28, 2026 · Jun 28, 2026 · Jun 28, 2026 · Jun 28, 2026
@@ -9,12 +9,14 @@ REM GGUF files start with magic bytes: 0x47 0x47 0x55 0x46 ("GGUF")
 
 setlocal enabledelayedexpansion
 
-set "MODELS=models\codellama-7b.Q2_K.gguf" "models\jina-reranker-v1-tiny-en-Q4_0.gguf" "models\AMD-Llama-135m-code.Q2_K.gguf" "models\Qwen3-0.6B-Q4_K_M.gguf" "models\Qwen2.5-1.5B-Instruct-Q4_K_M.gguf"
+REM Every CI Java test job (incl. Windows) now downloads the full model set before
+REM validating and runs the embedding / vision / TTS integration tests, so all of
+REM these are REQUIRED (a missing one is a hard failure, not a silent self-skip).
+set "MODELS=models\codellama-7b.Q2_K.gguf" "models\jina-reranker-v1-tiny-en-Q4_0.gguf" "models\AMD-Llama-135m-code.Q2_K.gguf" "models\Qwen3-0.6B-Q4_K_M.gguf" "models\Qwen2.5-1.5B-Instruct-Q4_K_M.gguf" "models\nomic-embed-text-v1.5.f16.gguf" "models\SmolVLM-500M-Instruct-Q8_0.gguf" "models\mmproj-SmolVLM-500M-Instruct-Q8_0.gguf" "models\OuteTTS-0.2-500M-Q4_K_M.gguf" "models\WavTokenizer-Large-75-F16.gguf"
 
-REM Vision GGUFs are validated only when present (the Windows job downloads
-REM them too, but the validation step must not fail when a future job opts
-REM out of the vision matrix).
-set "OPTIONAL_MODELS=models\SmolVLM-500M-Instruct-Q8_0.gguf" "models\mmproj-SmolVLM-500M-Instruct-Q8_0.gguf"
+REM No optional models remain. The audio-input model is deliberately not listed: it
+REM has no CI download and its test self-skips. Left empty so the optional loop below is a no-op.
+set "OPTIONAL_MODELS="
 
 echo Validating required model files...
 for %%M in (%MODELS%) do (

@@ -10,26 +10,31 @@
 
 set -e
 
+# Every CI Java test job (Linux + all macOS + all Windows) now downloads the full
+# model set before validating, and runs the embedding / vision / TTS integration
+# tests with their properties set — so all of these are REQUIRED, not optional. A
+# missing model is a hard failure here (it would otherwise let an integration test
+# silently self-skip). See .github/workflows/publish.yml.
 MODELS=(
   "models/codellama-7b.Q2_K.gguf"
   "models/jina-reranker-v1-tiny-en-Q4_0.gguf"
   "models/AMD-Llama-135m-code.Q2_K.gguf"
   "models/Qwen3-0.6B-Q4_K_M.gguf"
   "models/Qwen2.5-1.5B-Instruct-Q4_K_M.gguf"
-)
-
-# Optional GGUFs validated only when present so jobs that do not download
-# them (e.g. cross-compile smoke runs) still pass. The vision test image is
-# committed to src/test/resources/images/test-image.jpg and is not validated
-# here — its presence is asserted directly by MultimodalIntegrationTest.
-OPTIONAL_MODELS=(
   "models/nomic-embed-text-v1.5.f16.gguf"
   "models/SmolVLM-500M-Instruct-Q8_0.gguf"
   "models/mmproj-SmolVLM-500M-Instruct-Q8_0.gguf"
   "models/OuteTTS-0.2-500M-Q4_K_M.gguf"
   "models/WavTokenizer-Large-75-F16.gguf"
 )
 
+# Optional GGUFs are validated only when present; none are currently listed. The
+# vision test image (src/test/resources/images/test-image.jpg) is committed and is
+# not validated here — its presence is asserted directly by MultimodalIntegrationTest.
+# The audio-input model (AudioInputIntegrationTest) has no committed clip and no CI
+# download, so that test self-skips and its model is intentionally not listed here.
+OPTIONAL_MODELS=()
+
 validate_gguf() {
   local model="$1"
   local required="$2"

@@ -18,6 +18,7 @@ from version 5.0.0 onward. Pre-fork releases (`1.x`–`4.2.0`) were authored by
 - End-to-end vision input across blocking, typed `ChatRequest`, streaming, and OpenAI-compatible request mapping; real-model tests verify that distinct red and blue images produce the correct semantic answers.
 - Explicit `setMmprojAuto(boolean)` and `setMmprojOffload(boolean)` controls, including the upstream `--no-mmproj-auto` and `--no-mmproj-offload` flags.
 - Per-request KV controls: `InferenceParameters.withSlotId(int)` and `withCacheReuse(int)`.
+- Per-request DRY sampling controls on `InferenceParameters` (`dry_multiplier`/`dry_base`/`dry_allowed_length`/`dry_penalty_last_n`/`dry_sequence_breakers`).
 - Typed cache observability through `Usage.getCachedTokens()`, `Usage.getProcessedPromptTokens()`, `SlotMetrics`, and `ServerMetrics.getSlotMetrics()`.
 - Authenticated JSON `GET /metrics` and `GET /slots` endpoints on the embedded server.
 
@@ -27,9 +28,12 @@ from version 5.0.0 onward. Pre-fork releases (`1.x`–`4.2.0`) were authored by
 - README license badge corrected from "Apache 2.0" to "MIT" (matches `LICENSE` file and `pom.xml`).
 - `pom.xml` SCM URL: `tree/master` → `tree/main` (default branch renamed).
 - Upgraded llama.cpp from b9151 to b9172.
+- Upgraded llama.cpp from b9803 to b9829. Compiles the new upstream `server-stream.cpp` (resumable-streaming SSE replay buffer) into `libjllama`, required because `server-context`/`server-http`/`server-models` now reference its symbols; refreshed `patches/0001` for the `tests/test-export-graph-ops.cpp` rename and the `server.cpp` GC-init context shift.
+- `configureParallelInference` now applies `slot_prompt_similarity` live via `server_context::set_slot_prompt_similarity()` (upstream PR ggml-org/llama.cpp#22393, carried as `patches/0003` until merged), instead of validating it and discarding the value.
 - Extracted the `chatWithTools` agent loop into `ToolCallingAgent`; tool-result errors (unknown tool / handler exception) are now JSON-serialized so tool names containing special characters remain valid JSON.
 
 ### Fixed
+- Per-request `reasoning_budget_tokens` is now honored (via `patches/0004`, upstream PR ggml-org/llama.cpp#23116): `reasoning_budget_tokens=0` suppresses thinking. `ReasoningBudgetTest` now asserts the suppression directly (the previous test, which pinned the pre-fix buggy behavior, was removed).
 - Preserved decoded image buffers across the JNI chat boundary and submitted media requests through llama.cpp's upstream multimodal task path instead of silently tokenizing them as text-only prompts.
 - Preserved multipart image content when using the typed `ChatRequest` serializer.
 - The standalone OpenAI-compatible server now advertises vision only when the loaded model confirms usable vision support.