intentee · mcharytoniuk · Jun 10, 2026 · Jun 9, 2026 · Jun 10, 2026
diff --git a/.claude/.gitignore b/.claude/.gitignore
@@ -1 +1,2 @@
 *.lock
+/workflows/
diff --git a/.claude/rules/subproject-llama-cpp-bindings-sys.md b/.claude/rules/subproject-llama-cpp-bindings-sys.md
@@ -0,0 +1,10 @@
+---
+paths:
+  - "llama-cpp-bindings-sys/**"
+---
+
+# `llama-cpp-bindings-sys` Context
+
+- Every CPP exception MUST be surfaced to the Rust side of the project.
+- If a CPP issue can be precisely identified, and mapped into an enum on the Rust side, it must be mapped.
+- CPP bindings must remain minimal wrappers over the `llama.cpp` API. All logic that can be moved to Rust must be moved there, and be unit testable.
diff --git a/.claude/rules/subproject-llama-cpp-bindings-types.md b/.claude/rules/subproject-llama-cpp-bindings-types.md
@@ -0,0 +1,9 @@
+---
+paths:
+  - "llama-cpp-bindings-types/**"
+---
+
+# `llama-cpp-bindings-types` Context
+
+- The purpose of `llama-cpp-bindings-types` is to provide a thin layer of types that do not need to rely on the vendored `llama.cpp` library itself.
+- `llama-cpp-bindings-types` must not depend on llama.cpp bindings themselves
diff --git a/.claude/rules/subproject-llama-cpp-test-harness.md b/.claude/rules/subproject-llama-cpp-test-harness.md
@@ -0,0 +1,11 @@
+---
+paths:
+  - "llama-cpp-test-harness/**"
+  - "llama-cpp-test-harness-macros/**"
+---
+
+# `llama-cpp-test-harness` Context
+
+- The purpose of `llama-cpp-test-harness` is to provide a custom harness that optimizes the tests to minimize model swaps.
+- It must analyze all the relevant test attributes, and plan the execution to minimize the model swaps
+- It needs to group the tests by model type they depend on, and execute them in phases (where each phase represents a different model)
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -6,6 +6,7 @@ members = [
   "llama-cpp-bindings-types",
   "llama-cpp-bindings",
   "llama-cpp-bindings-tests",
+  "llama-cpp-error-recorder",
   "llama-cpp-log-decoder",
   "llama-cpp-test-harness",
   "llama-cpp-test-harness-macros",
@@ -33,6 +34,7 @@ llama-cpp-bindings = { path = "llama-cpp-bindings", version = "=0.8.0" }
 llama-cpp-bindings-build = { path = "llama-cpp-bindings-build", version = "=0.8.0" }
 llama-cpp-bindings-sys = { path = "llama-cpp-bindings-sys", version = "=0.8.0" }
 llama-cpp-bindings-types = { path = "llama-cpp-bindings-types", version = "=0.8.0" }
+llama-cpp-error-recorder = { path = "llama-cpp-error-recorder", version = "=0.8.0" }
 llama-cpp-log-decoder = { path = "llama-cpp-log-decoder", version = "=0.8.0" }
 llama-cpp-test-harness = { path = "llama-cpp-test-harness", version = "=0.8.0" }
 llama-cpp-test-harness-macros = { path = "llama-cpp-test-harness-macros", version = "=0.8.0" }

diff --git a/Makefile b/Makefile
@@ -26,7 +26,8 @@ coverage: node_modules
 	cargo llvm-cov report
 	npx rust-coverage-check target/llvm-cov.json \
 		--workspace-root $(CURDIR) \
-		--gated llama-cpp-bindings=95 \
+		--gated llama-cpp-bindings=98 \
+		--gated llama-cpp-error-recorder=100 \
 		--gated llama-cpp-log-decoder=100 \
 		--gated llama-cpp-bindings-types=100 \
 		--gated llama-cpp-test-harness=99 \

diff --git a/llama-cpp-bindings-sys/wrapper.h b/llama-cpp-bindings-sys/wrapper.h
@@ -1,5 +1,6 @@
 #include "llama.cpp/include/llama.h"
 #include "llama.cpp/ggml/include/gguf.h"
+#include "wrapper_chat_apply.h"
 #include "wrapper_chat_parse.h"
 #include "wrapper_common.h"
 #include "wrapper_fit.h"

diff --git a/llama-cpp-bindings-sys/wrapper_chat_apply.cpp b/llama-cpp-bindings-sys/wrapper_chat_apply.cpp
@@ -0,0 +1,96 @@
+#include "wrapper_chat_apply.h"
+#include "wrapper_token_text.h"
+
+#include "llama.cpp/common/chat-auto-parser.h"
+#include "llama.cpp/common/chat.h"
+#include "llama.cpp/include/llama.h"
+
+#include <exception>
+#include <new>
+#include <nlohmann/json.hpp>
+#include <string>
+
+using wrapper_helpers::token_text_or_empty;
+
+// Stores a copy of `message` in `*out_error` and selects the status to return
+// for a C++ exception caught at the FFI boundary.
+//
+// Declared `noexcept`, with the copy guarded by its own try/catch, so that a
+// failure while building the error string (for example std::bad_alloc thrown by
+// the std::string constructor) cannot escape the enclosing catch handler and
+// cross the extern "C" boundary.
+static llama_rs_apply_chat_template_status llama_rs_apply_chat_template_report_exception(
+    const char * message,
+    char ** out_error) noexcept {
+    try {
+        *out_error = llama_rs_dup_string(std::string(message));
+    } catch (...) {
+        *out_error = nullptr;
+    }
+    if (!*out_error) {
+        return LLAMA_RS_APPLY_CHAT_TEMPLATE_ERROR_STRING_ALLOCATION_FAILED;
+    }
+    return LLAMA_RS_APPLY_CHAT_TEMPLATE_VENDORED_THREW_CXX_EXCEPTION;
+}
+
+// Renders the chat template `template_src` over `n_messages` (role, content)
+// pairs, using the text of the model vocab's BOS/EOS tokens.
+//
+// `roles` and `contents` are parallel arrays; a NULL entry is rendered as "".
+// A non-zero `add_generation_prompt` is forwarded as `true` to the template inputs.
+//
+// `*out_string` and `*out_error` are reset to NULL on entry (when the pointers
+// themselves are non-NULL). On LLAMA_RS_APPLY_CHAT_TEMPLATE_OK, `*out_string`
+// holds the rendered text. On LLAMA_RS_APPLY_CHAT_TEMPLATE_VENDORED_THREW_CXX_EXCEPTION,
+// `*out_error` holds the exception message. Every other status leaves both NULL.
+//
+// No C++ exception is allowed to leave this function.
+extern "C" llama_rs_apply_chat_template_status llama_rs_apply_chat_template(
+    const struct llama_model * model,
+    const char * template_src,
+    const char * const * roles,
+    const char * const * contents,
+    size_t n_messages,
+    int add_generation_prompt,
+    char ** out_string,
+    char ** out_error) {
+    // Clear the outputs first so every early return leaves them in a known state.
+    if (out_string) {
+        *out_string = nullptr;
+    }
+    if (out_error) {
+        *out_error = nullptr;
+    }
+    if (!model) {
+        return LLAMA_RS_APPLY_CHAT_TEMPLATE_NULL_MODEL_ARG;
+    }
+    if (!template_src) {
+        return LLAMA_RS_APPLY_CHAT_TEMPLATE_NULL_TEMPLATE_ARG;
+    }
+    // The message arrays may only be NULL when there are no messages to read.
+    if (n_messages > 0 && (!roles || !contents)) {
+        return LLAMA_RS_APPLY_CHAT_TEMPLATE_NULL_MESSAGES_ARG;
+    }
+    if (!out_string) {
+        return LLAMA_RS_APPLY_CHAT_TEMPLATE_NULL_OUT_STRING_ARG;
+    }
+    if (!out_error) {
+        return LLAMA_RS_APPLY_CHAT_TEMPLATE_NULL_OUT_ERROR_ARG;
+    }
+
+    try {
+        const llama_vocab * vocab = llama_model_get_vocab(model);
+        if (!vocab) {
+            return LLAMA_RS_APPLY_CHAT_TEMPLATE_MODEL_HAS_NO_VOCAB;
+        }
+
+        std::string bos_token = token_text_or_empty(vocab, llama_vocab_bos(vocab));
+        std::string eos_token = token_text_or_empty(vocab, llama_vocab_eos(vocab));
+
+        common_chat_template tmpl(template_src, bos_token, eos_token);
+
+        nlohmann::ordered_json messages = nlohmann::ordered_json::array();
+        for (size_t index = 0; index < n_messages; index++) {
+            messages.push_back({
+                { "role", roles[index] ? roles[index] : "" },
+                { "content", contents[index] ? contents[index] : "" },
+            });
+        }
+
+        autoparser::generation_params inputs;
+        inputs.messages              = std::move(messages);
+        inputs.tools                 = nlohmann::ordered_json::array();
+        inputs.add_generation_prompt = add_generation_prompt != 0;
+
+        std::string rendered = common_chat_template_direct_apply(tmpl, inputs);
+        // An empty rendering is reported as a failure rather than as an empty string.
+        if (rendered.empty()) {
+            return LLAMA_RS_APPLY_CHAT_TEMPLATE_TEMPLATE_APPLICATION_FAILED;
+        }
+
+        *out_string = llama_rs_dup_string(rendered);
+        if (!*out_string) {
+            return LLAMA_RS_APPLY_CHAT_TEMPLATE_ERROR_STRING_ALLOCATION_FAILED;
+        }
+
+        return LLAMA_RS_APPLY_CHAT_TEMPLATE_OK;
+    } catch (const std::bad_alloc &) {
+        // Out of memory: do not attempt to allocate an error message.
+        return LLAMA_RS_APPLY_CHAT_TEMPLATE_ERROR_STRING_ALLOCATION_FAILED;
+    } catch (const std::exception & ex) {
+        return llama_rs_apply_chat_template_report_exception(ex.what(), out_error);
+    } catch (...) {
+        return llama_rs_apply_chat_template_report_exception("unknown c++ exception", out_error);
+    }
+}
diff --git a/llama-cpp-bindings-sys/wrapper_chat_apply.h b/llama-cpp-bindings-sys/wrapper_chat_apply.h
@@ -0,0 +1,37 @@
+#pragma once
+
+#include "llama.cpp/include/llama.h"
+#include "wrapper_utils.h"
+
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Status codes returned by `llama_rs_apply_chat_template`.
+typedef enum llama_rs_apply_chat_template_status {
+    // The template was rendered; `*out_string` holds the result.
+    LLAMA_RS_APPLY_CHAT_TEMPLATE_OK = 0,
+    // `model` was NULL.
+    LLAMA_RS_APPLY_CHAT_TEMPLATE_NULL_MODEL_ARG,
+    // `template_src` was NULL.
+    LLAMA_RS_APPLY_CHAT_TEMPLATE_NULL_TEMPLATE_ARG,
+    // `roles` or `contents` was NULL while `n_messages` was greater than zero.
+    LLAMA_RS_APPLY_CHAT_TEMPLATE_NULL_MESSAGES_ARG,
+    // `out_string` was NULL.
+    LLAMA_RS_APPLY_CHAT_TEMPLATE_NULL_OUT_STRING_ARG,
+    // `out_error` was NULL.
+    LLAMA_RS_APPLY_CHAT_TEMPLATE_NULL_OUT_ERROR_ARG,
+    // `llama_model_get_vocab` returned NULL for the model.
+    LLAMA_RS_APPLY_CHAT_TEMPLATE_MODEL_HAS_NO_VOCAB,
+    // Applying the template produced an empty string.
+    LLAMA_RS_APPLY_CHAT_TEMPLATE_TEMPLATE_APPLICATION_FAILED,
+    // Copying the rendered text or an exception message failed, or
+    // std::bad_alloc was caught while rendering.
+    LLAMA_RS_APPLY_CHAT_TEMPLATE_ERROR_STRING_ALLOCATION_FAILED,
+    // A C++ exception other than std::bad_alloc was caught; `*out_error`
+    // holds its message.
+    LLAMA_RS_APPLY_CHAT_TEMPLATE_VENDORED_THREW_CXX_EXCEPTION,
+} llama_rs_apply_chat_template_status;
+
+// Renders the chat template `template_src` over a list of chat messages for `model`.
+//
+// `roles` and `contents` are parallel arrays of `n_messages` C strings; they may
+// be NULL only when `n_messages` is zero. A non-zero `add_generation_prompt`
+// asks the template to append the generation prompt.
+//
+// `out_string` and `out_error` must both be non-NULL. On
+// LLAMA_RS_APPLY_CHAT_TEMPLATE_OK, `*out_string` receives the rendered text; on
+// LLAMA_RS_APPLY_CHAT_TEMPLATE_VENDORED_THREW_CXX_EXCEPTION, `*out_error`
+// receives the exception message.
+// NOTE(review): both strings are produced by `llama_rs_dup_string`; presumably
+// the caller must release them with the matching free helper from
+// wrapper_utils.h - confirm.
+llama_rs_apply_chat_template_status llama_rs_apply_chat_template(
+    const struct llama_model * model,
+    const char * template_src,
+    const char * const * roles,
+    const char * const * contents,
+    size_t n_messages,
+    int add_generation_prompt,
+    char ** out_string,
+    char ** out_error);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/llama-cpp-bindings/fixtures/llamas.jpg → llama-cpp-bindings-tests/fixtures/llamas.jpg b/llama-cpp-bindings/fixtures/llamas.jpg → llama-cpp-bindings-tests/fixtures/llamas.jpg
diff --git a/llama-cpp-bindings-tests/fixtures/orange_cat.wav b/llama-cpp-bindings-tests/fixtures/orange_cat.wav
diff --git a/llama-cpp-bindings-tests/fixtures/quick_brown_fox.wav b/llama-cpp-bindings-tests/fixtures/quick_brown_fox.wav
diff --git a/llama-cpp-bindings-tests/src/build_user_prompt_with_media_marker.rs b/llama-cpp-bindings-tests/src/build_user_prompt_with_media_marker.rs
@@ -0,0 +1,16 @@
+use anyhow::Result;
+use llama_cpp_bindings::model::LlamaChatMessage;
+use llama_cpp_bindings::model::LlamaModel;
+use llama_cpp_bindings::mtmd::mtmd_default_marker;
+
+/// Renders a one-message `"user"` conversation whose content is the default
+/// media marker immediately followed by `question`, using the template
+/// returned by `model.chat_template(None)`.
+///
+/// # Errors
+///
+/// Propagates failures from the default-marker lookup, the chat-template lookup,
+/// chat message construction, and template application, in that order.
+pub fn build_user_prompt_with_media_marker(model: &LlamaModel, question: &str) -> Result<String> {
+    let media_marker = mtmd_default_marker()?;
+    let template = model.chat_template(None)?;
+    let user_message =
+        LlamaChatMessage::new(String::from("user"), format!("{media_marker}{question}"))?;
+    // NOTE(review): the trailing `true` presumably requests the generation prompt - confirm.
+    let rendered = model.apply_chat_template(&template, &[user_message], true)?;
+
+    Ok(rendered)
+}
diff --git a/llama-cpp-bindings-tests/src/chunk_token_breakdown.rs b/llama-cpp-bindings-tests/src/chunk_token_breakdown.rs
@@ -0,0 +1,36 @@
+use anyhow::Context;
+use anyhow::Result;
+use llama_cpp_bindings::mtmd::MtmdInputChunkType;
+use llama_cpp_bindings::mtmd::MtmdInputChunks;
+
+/// Token totals for a set of multimodal input chunks, split by chunk type.
+#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
+pub struct ChunkTokenBreakdown {
+    /// Tokens contributed by [`MtmdInputChunkType::Text`] chunks.
+    pub text: u64,
+    /// Tokens contributed by [`MtmdInputChunkType::Image`] chunks.
+    pub image: u64,
+    /// Tokens contributed by [`MtmdInputChunkType::Audio`] chunks.
+    pub audio: u64,
+}
+
+impl ChunkTokenBreakdown {
+    /// Walks every chunk index in `0..chunks.len()` and adds each chunk's
+    /// `n_tokens()` to the total matching its chunk type.
+    ///
+    /// # Errors
+    ///
+    /// Fails when an index below `chunks.len()` yields no chunk, when a token
+    /// count cannot be converted to `u64`, or when chunk-type classification fails.
+    pub fn from_chunks(chunks: &MtmdInputChunks) -> Result<Self> {
+        let mut breakdown = Self::default();
+        for index in 0..chunks.len() {
+            let chunk = chunks
+                .get(index)
+                .with_context(|| format!("chunk index {index} is missing"))?;
+            let n_tokens = u64::try_from(chunk.n_tokens())?;
+            // Exhaustive on purpose: a new chunk type must be handled explicitly.
+            let total = match chunk.chunk_type()? {
+                MtmdInputChunkType::Text => &mut breakdown.text,
+                MtmdInputChunkType::Image => &mut breakdown.image,
+                MtmdInputChunkType::Audio => &mut breakdown.audio,
+            };
+            *total += n_tokens;
+        }
+
+        Ok(breakdown)
+    }
+}
diff --git a/llama-cpp-bindings-tests/src/classify_sample_loop.rs b/llama-cpp-bindings-tests/src/classify_sample_loop.rs
@@ -129,4 +129,50 @@ mod tests {
         assert_eq!(outcome.observed_reasoning, 0);
         assert_eq!(outcome.observed_undeterminable, 0);
     }
+
+    // A reasoning token recorded with the end-of-generation flag unset is counted
+    // and its visible piece lands in the reasoning stream.
+    #[test]
+    fn record_outcome_reasoning_token_streams_visible_piece() {
+        let mut recorded = ClassifySampleLoopOutcome::default();
+        let reasoning_ingest = IngestOutcome {
+            sampled_token: SampledToken::Reasoning(LlamaToken(7)),
+            visible_piece: String::from("thinking"),
+            raw_piece: String::new(),
+        };
+
+        record_outcome(&reasoning_ingest, &mut recorded, false);
+
+        assert_eq!(recorded.observed_reasoning, 1);
+        assert_eq!(recorded.reasoning_stream, "thinking");
+    }
+
+    // A reasoning token recorded with the end-of-generation flag set is still
+    // counted, but nothing is appended to the reasoning stream.
+    #[test]
+    fn record_outcome_reasoning_token_at_end_of_generation_is_not_streamed() {
+        let mut recorded = ClassifySampleLoopOutcome::default();
+        let final_ingest = IngestOutcome {
+            sampled_token: SampledToken::Reasoning(LlamaToken(7)),
+            visible_piece: String::from("thinking"),
+            raw_piece: String::new(),
+        };
+
+        record_outcome(&final_ingest, &mut recorded, true);
+
+        assert_eq!(recorded.observed_reasoning, 1);
+        assert!(recorded.reasoning_stream.is_empty());
+    }
+
+    // An undeterminable token bumps its counter while leaving both the content
+    // and the reasoning streams untouched.
+    #[test]
+    fn record_outcome_undeterminable_token_counts_without_streaming() {
+        let mut recorded = ClassifySampleLoopOutcome::default();
+        let undeterminable_ingest = IngestOutcome {
+            sampled_token: SampledToken::Undeterminable(LlamaToken(9)),
+            visible_piece: String::from("ignored"),
+            raw_piece: String::new(),
+        };
+
+        record_outcome(&undeterminable_ingest, &mut recorded, false);
+
+        assert_eq!(recorded.observed_undeterminable, 1);
+        assert!(recorded.content_stream.is_empty());
+        assert!(recorded.reasoning_stream.is_empty());
+    }
 }
diff --git a/llama-cpp-test-harness/src/fixtures_dir.rs → llama-cpp-bindings-tests/src/fixtures_dir.rs b/llama-cpp-test-harness/src/fixtures_dir.rs → llama-cpp-bindings-tests/src/fixtures_dir.rs
diff --git a/llama-cpp-bindings-tests/src/lib.rs b/llama-cpp-bindings-tests/src/lib.rs
@@ -1,2 +1,6 @@
+pub mod build_user_prompt_with_media_marker;
+pub mod chunk_token_breakdown;
 pub mod classify_sample_loop;
+pub mod fixtures_dir;
 pub mod prime_kv_cache;
+pub mod prime_kv_cache_with;
diff --git a/llama-cpp-bindings-tests/src/prime_kv_cache.rs b/llama-cpp-bindings-tests/src/prime_kv_cache.rs
@@ -1,15 +1,11 @@
 use anyhow::Result;
 use llama_cpp_bindings::context::LlamaContext;
-use llama_cpp_bindings::llama_batch::LlamaBatch;
-use llama_cpp_bindings::model::AddBos;
 use llama_cpp_test_harness::LlamaFixture;
 
+use crate::prime_kv_cache_with::prime_kv_cache_with;
+
 /// # Errors
 /// Forwards tokenization, batch construction, and [`LlamaContext::decode`] errors verbatim.
 pub fn prime_kv_cache(fixture: &LlamaFixture<'_>, context: &mut LlamaContext<'_>) -> Result<()> {
-    let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?;
-    let mut batch = LlamaBatch::new(512, 1)?;
-    batch.add_sequence(&tokens, 0, false)?;
-    context.decode(&mut batch)?;
-    Ok(())
+    prime_kv_cache_with(fixture, context, "Hello world", 512)
 }
diff --git a/llama-cpp-bindings-tests/src/prime_kv_cache_with.rs b/llama-cpp-bindings-tests/src/prime_kv_cache_with.rs
@@ -0,0 +1,20 @@
+use anyhow::Result;
+use llama_cpp_bindings::context::LlamaContext;
+use llama_cpp_bindings::llama_batch::LlamaBatch;
+use llama_cpp_bindings::model::AddBos;
+use llama_cpp_test_harness::LlamaFixture;
+
+/// Tokenizes `text` with the fixture's model (`AddBos::Always`), loads the
+/// tokens into a freshly created [`LlamaBatch`] built with `batch_capacity`,
+/// and decodes that batch on `context`.
+///
+/// # Errors
+/// Tokenization, batch construction, sequence insertion, and
+/// [`LlamaContext::decode`] errors are passed through unchanged.
+pub fn prime_kv_cache_with(
+    fixture: &LlamaFixture<'_>,
+    context: &mut LlamaContext<'_>,
+    text: &str,
+    batch_capacity: usize,
+) -> Result<()> {
+    let prompt_tokens = fixture.model.str_to_token(text, AddBos::Always)?;
+
+    let mut priming_batch = LlamaBatch::new(batch_capacity, 1)?;
+    priming_batch.add_sequence(&prompt_tokens, 0, false)?;
+
+    context.decode(&mut priming_batch)?;
+
+    Ok(())
+}