Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ from version 5.0.0 onward. Pre-fork releases (`1.x`–`4.2.0`) were authored by
- Explicit `setMmprojAuto(boolean)` and `setMmprojOffload(boolean)` controls, including the upstream `--no-mmproj-auto` and `--no-mmproj-offload` flags.
- Per-request KV controls: `InferenceParameters.withSlotId(int)` and `withCacheReuse(int)`.
- Per-request DRY sampling to `InferenceParameters` (`dry_multiplier`/`dry_base`/`dry_allowed_length`/`dry_penalty_last_n`/`dry_sequence_breakers`).
- `ModelParameters.enableSwaFull()` (`--swa-full`): keeps the full-size SWA KV cache to enable cross-request prompt-prefix reuse, at roughly 2x the SWA-layer KV RAM.
- Typed cache observability through `Usage.getCachedTokens()`, `Usage.getProcessedPromptTokens()`, `SlotMetrics`, and `ServerMetrics.getSlotMetrics()`.
- Authenticated JSON `GET /metrics` and `GET /slots` endpoints on the embedded server.

Expand Down
5 changes: 5 additions & 0 deletions src/main/java/net/ladenthin/llama/args/ModelFlag.java
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,11 @@ public enum ModelFlag {
/** Enable Flash Attention. */
FLASH_ATTN("--flash-attn"),

/** Keep the full-size sliding-window-attention (SWA) KV cache, enabling cross-request
* prompt-prefix reuse (pairs with --cache-reuse) at ~2x the SWA-layer KV RAM. Default off.
* Env: LLAMA_ARG_SWA_FULL. */
SWA_FULL("--swa-full"),

/** Disable internal libllama performance timings. */
NO_PERF("--no-perf"),

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,17 @@ public ModelParameters enableFlashAttn() {
return setFlag(ModelFlag.FLASH_ATTN);
}

/**
 * Enables the {@code --swa-full} flag, keeping the sliding-window-attention (SWA) KV cache at
 * full size so the KV entries of the sliding-window layers can be reused across requests.
 * Combined with {@link #setCacheReuse(int)} this restores prompt-prefix cache reuse, at the
 * cost of roughly 2x the SWA-layer KV RAM. The flag is off by default and only pays off for
 * multi-request sessions that share a prompt prefix.
 *
 * @return this builder
 */
public ModelParameters enableSwaFull() {
    final ModelParameters builder = setFlag(ModelFlag.SWA_FULL);
    return builder;
}

/**
* Disable internal libllama performance timings (default: false).
*
Expand Down
3 changes: 2 additions & 1 deletion src/test/java/net/ladenthin/llama/args/ModelFlagTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ public static Collection<Object[]> data() {
return Arrays.asList(new Object[][] {
{ModelFlag.NO_CONTEXT_SHIFT, "--no-context-shift"},
{ModelFlag.FLASH_ATTN, "--flash-attn"},
{ModelFlag.SWA_FULL, "--swa-full"},
{ModelFlag.NO_PERF, "--no-perf"},
{ModelFlag.ESCAPE, "--escape"},
{ModelFlag.NO_ESCAPE, "--no-escape"},
Expand Down Expand Up @@ -66,7 +67,7 @@ public void testGetCliFlag(ModelFlag flag, String expectedCliFlag) {

/**
 * Guards the number of {@link ModelFlag} constants so that adding or removing a flag forces
 * this expectation to be revisited.
 */
@Test
public void testEnumCount() {
    // 35 constants now that SWA_FULL exists; the previous expectation of 34 is dropped
    // because the two assertions could never hold at the same time.
    assertEquals(35, ModelFlag.values().length);
}

@ParameterizedTest(name = "{0} -> {1}")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -641,6 +641,18 @@
assertThat(p.parameters.get("--flash-attn"), is(nullValue()));
}

@Test
public void testEnableSwaFull() {

Check warning on line 645 in src/test/java/net/ladenthin/llama/parameters/ModelParametersExtendedTest.java

View check run for this annotation

SonarQubeCloud / SonarCloud Code Analysis

Remove this 'public' modifier.

See more on https://sonarcloud.io/project/issues?id=bernardladenthin_java-llama.cpp&issues=AZ8OMBwOIHStKnsFnpF1&open=AZ8OMBwOIHStKnsFnpF1&pullRequest=275
ModelParameters p = new ModelParameters().enableSwaFull();
assertThat(p.parameters, hasKey("--swa-full"));
assertThat(p.parameters.get("--swa-full"), is(nullValue()));
}

@Test
public void testSwaFullNotEnabledByDefault() {

Check warning on line 652 in src/test/java/net/ladenthin/llama/parameters/ModelParametersExtendedTest.java

View check run for this annotation

SonarQubeCloud / SonarCloud Code Analysis

Remove this 'public' modifier.

See more on https://sonarcloud.io/project/issues?id=bernardladenthin_java-llama.cpp&issues=AZ8OMBwOIHStKnsFnpF2&open=AZ8OMBwOIHStKnsFnpF2&pullRequest=275
assertThat(new ModelParameters().parameters, not(hasKey("--swa-full")));
}

@Test
public void testDisablePerf() {
ModelParameters p = new ModelParameters().disablePerf();
Expand Down
Loading