bernardladenthin · bernardladenthin · Jun 28, 2026 · Jun 28, 2026
@@ -19,6 +19,7 @@ from version 5.0.0 onward. Pre-fork releases (`1.x`–`4.2.0`) were authored by
 - Explicit `setMmprojAuto(boolean)` and `setMmprojOffload(boolean)` controls, including the upstream `--no-mmproj-auto` and `--no-mmproj-offload` flags.
 - Per-request KV controls: `InferenceParameters.withSlotId(int)` and `withCacheReuse(int)`.
 - Per-request DRY sampling to `InferenceParameters` (`dry_multiplier`/`dry_base`/`dry_allowed_length`/`dry_penalty_last_n`/`dry_sequence_breakers`).
+- `ModelParameters.enableSwaFull()` (`--swa-full`): keeps the full-size sliding-window-attention (SWA) KV cache to enable cross-request prompt-prefix reuse, at ~2x the SWA-layer KV RAM.
 - Typed cache observability through `Usage.getCachedTokens()`, `Usage.getProcessedPromptTokens()`, `SlotMetrics`, and `ServerMetrics.getSlotMetrics()`.
 - Authenticated JSON `GET /metrics` and `GET /slots` endpoints on the embedded server.
 

@@ -22,6 +22,11 @@ public enum ModelFlag {
     /** Enable Flash Attention. */
     FLASH_ATTN("--flash-attn"),
 
+    /** Keep the full-size sliding-window-attention (SWA) KV cache, enabling cross-request
+     *  prompt-prefix reuse (pairs with {@code --cache-reuse}) at ~2x the SWA-layer KV RAM.
+     *  Off by default. Env: {@code LLAMA_ARG_SWA_FULL}. */
+    SWA_FULL("--swa-full"),
+
     /** Disable internal libllama performance timings. */
     NO_PERF("--no-perf"),
 

@@ -255,6 +255,17 @@ public ModelParameters enableFlashAttn() {
         return setFlag(ModelFlag.FLASH_ATTN);
     }
 
+    /**
+     * Enable the full-size sliding-window-attention (SWA) KV cache ({@code --swa-full}) so that
+     * the sliding-window layers' KV is reusable across requests. This restores prompt-prefix
+     * cache reuse with {@link #setCacheReuse(int)} at a cost of ~2x SWA-layer KV RAM. Off by
+     * default; only beneficial for multi-request sessions sharing a prompt prefix.
+     *
+     * @return this builder
+     */
+    public ModelParameters enableSwaFull() {
+        return setFlag(ModelFlag.SWA_FULL);
+    }
+
     /**
      * Disable internal libllama performance timings (default: false).
      *

@@ -19,6 +19,7 @@ public static Collection<Object[]> data() {
         return Arrays.asList(new Object[][] {
             {ModelFlag.NO_CONTEXT_SHIFT, "--no-context-shift"},
             {ModelFlag.FLASH_ATTN, "--flash-attn"},
+            {ModelFlag.SWA_FULL, "--swa-full"},
             {ModelFlag.NO_PERF, "--no-perf"},
             {ModelFlag.ESCAPE, "--escape"},
             {ModelFlag.NO_ESCAPE, "--no-escape"},
@@ -66,7 +67,7 @@ public void testGetCliFlag(ModelFlag flag, String expectedCliFlag) {
 
     @Test
     public void testEnumCount() {
-        assertEquals(34, ModelFlag.values().length);
+        assertEquals(35, ModelFlag.values().length);
     }
 
     @ParameterizedTest(name = "{0} -> {1}")

@@ -641,6 +641,18 @@
         assertThat(p.parameters.get("--flash-attn"), is(nullValue()));
     }
 
+    @Test
+    public void testEnableSwaFull() {
+        ModelParameters p = new ModelParameters().enableSwaFull();
+        // The flag must be registered under its CLI name ...
+        assertThat(p.parameters, hasKey("--swa-full"));
+        // ... and, being a bare switch, must be stored with a null value.
+        assertThat(p.parameters.get("--swa-full"), is(nullValue()));
+    }
+
+    @Test
+    public void testSwaFullNotEnabledByDefault() {
+        // A freshly constructed ModelParameters must not contain the --swa-full key.
+        assertThat(new ModelParameters().parameters, not(hasKey("--swa-full")));
+    }
+
     @Test
     public void testDisablePerf() {
         ModelParameters p = new ModelParameters().disablePerf();