Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .claude/.gitignore
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
*.lock
/workflows/
10 changes: 10 additions & 0 deletions .claude/rules/subproject-llama-cpp-bindings-sys.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
---
paths:
- "llama-cpp-bindings-sys/**"
---

# `llama-cpp-bindings-sys` Context

- Every CPP exception MUST be surfaced to the Rust side of the project.
- If a CPP issue can be precisely identified, and mapped into an enum on the Rust side, it must be mapped.
- CPP bindings must remain minimal wrappers over the `llama.cpp` API. As much logic as possible must be moved to Rust, where it must be unit testable.
9 changes: 9 additions & 0 deletions .claude/rules/subproject-llama-cpp-bindings-types.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
---
paths:
- "llama-cpp-bindings-types/**"
---

# `llama-cpp-bindings-types` Context

- The purpose of `llama-cpp-bindings-types` is to provide a thin layer of types that do not need to rely on the vendored `llama.cpp` library itself.
- `llama-cpp-bindings-types` must not depend on llama.cpp bindings themselves
11 changes: 11 additions & 0 deletions .claude/rules/subproject-llama-cpp-test-harness.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
---
paths:
- "llama-cpp-test-harness/**"
- "llama-cpp-test-harness-macros/**"
---

# `llama-cpp-test-harness` Context

- The purpose of `llama-cpp-test-harness` is to provide a custom harness that optimizes the tests to minimize model swaps.
- It must analyze all the relevant test attributes, and plan the execution to minimize the model swaps
- It needs to group the tests by model type they depend on, and execute them in phases (where each phase represents a different model)
5 changes: 5 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ members = [
"llama-cpp-bindings-types",
"llama-cpp-bindings",
"llama-cpp-bindings-tests",
"llama-cpp-error-recorder",
"llama-cpp-log-decoder",
"llama-cpp-test-harness",
"llama-cpp-test-harness-macros",
Expand Down Expand Up @@ -33,6 +34,7 @@ llama-cpp-bindings = { path = "llama-cpp-bindings", version = "=0.8.0" }
llama-cpp-bindings-build = { path = "llama-cpp-bindings-build", version = "=0.8.0" }
llama-cpp-bindings-sys = { path = "llama-cpp-bindings-sys", version = "=0.8.0" }
llama-cpp-bindings-types = { path = "llama-cpp-bindings-types", version = "=0.8.0" }
llama-cpp-error-recorder = { path = "llama-cpp-error-recorder", version = "=0.8.0" }
llama-cpp-log-decoder = { path = "llama-cpp-log-decoder", version = "=0.8.0" }
llama-cpp-test-harness = { path = "llama-cpp-test-harness", version = "=0.8.0" }
llama-cpp-test-harness-macros = { path = "llama-cpp-test-harness-macros", version = "=0.8.0" }
Expand Down
3 changes: 2 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@ coverage: node_modules
cargo llvm-cov report
npx rust-coverage-check target/llvm-cov.json \
--workspace-root $(CURDIR) \
--gated llama-cpp-bindings=95 \
--gated llama-cpp-bindings=98 \
--gated llama-cpp-error-recorder=100 \
--gated llama-cpp-log-decoder=100 \
--gated llama-cpp-bindings-types=100 \
--gated llama-cpp-test-harness=99 \
Expand Down
1 change: 1 addition & 0 deletions llama-cpp-bindings-sys/wrapper.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#include "llama.cpp/include/llama.h"
#include "llama.cpp/ggml/include/gguf.h"
#include "wrapper_chat_apply.h"
#include "wrapper_chat_parse.h"
#include "wrapper_common.h"
#include "wrapper_fit.h"
Expand Down
96 changes: 96 additions & 0 deletions llama-cpp-bindings-sys/wrapper_chat_apply.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
#include "wrapper_chat_apply.h"
#include "wrapper_token_text.h"

#include "llama.cpp/common/chat-auto-parser.h"
#include "llama.cpp/common/chat.h"
#include "llama.cpp/include/llama.h"

#include <exception>
#include <new>
#include <nlohmann/json.hpp>
#include <string>

using wrapper_helpers::token_text_or_empty;

namespace {

// Copies `message` into `*out_error` and maps the outcome to a status code.
//
// This runs inside the catch handlers of an extern "C" entry point. Building the
// std::string copy can itself throw (for example std::bad_alloc), and an exception
// leaving a handler would propagate across the FFI boundary. Any such secondary
// failure is therefore swallowed here and reported as an allocation failure.
llama_rs_apply_chat_template_status apply_chat_template_exception_status(
    const char * message,
    char ** out_error) noexcept {
    try {
        *out_error = llama_rs_dup_string(std::string(message ? message : ""));
    } catch (...) {
        *out_error = nullptr;
    }
    if (!*out_error) {
        return LLAMA_RS_APPLY_CHAT_TEMPLATE_ERROR_STRING_ALLOCATION_FAILED;
    }

    return LLAMA_RS_APPLY_CHAT_TEMPLATE_VENDORED_THREW_CXX_EXCEPTION;
}

} // namespace

// Renders `n_messages` (role, content) pairs through the chat template `template_src`,
// using the BOS/EOS token text of `model`'s vocabulary, and stores a duplicated copy of
// the rendered prompt in `*out_string`.
//
// Both output pointers are reset to nullptr before any validation so the caller never
// observes stale values. Null entries inside `roles` / `contents` are treated as empty
// strings. An empty rendering is reported as TEMPLATE_APPLICATION_FAILED.
//
// No C++ exception leaves this function: std::bad_alloc maps to
// ERROR_STRING_ALLOCATION_FAILED, and every other exception maps to
// VENDORED_THREW_CXX_EXCEPTION with its message duplicated into `*out_error`.
extern "C" llama_rs_apply_chat_template_status llama_rs_apply_chat_template(
    const struct llama_model * model,
    const char * template_src,
    const char * const * roles,
    const char * const * contents,
    size_t n_messages,
    int add_generation_prompt,
    char ** out_string,
    char ** out_error) {
    if (out_string) {
        *out_string = nullptr;
    }
    if (out_error) {
        *out_error = nullptr;
    }
    if (!model) {
        return LLAMA_RS_APPLY_CHAT_TEMPLATE_NULL_MODEL_ARG;
    }
    if (!template_src) {
        return LLAMA_RS_APPLY_CHAT_TEMPLATE_NULL_TEMPLATE_ARG;
    }
    // The message arrays may only be null when there are no messages to read.
    if (n_messages > 0 && (!roles || !contents)) {
        return LLAMA_RS_APPLY_CHAT_TEMPLATE_NULL_MESSAGES_ARG;
    }
    if (!out_string) {
        return LLAMA_RS_APPLY_CHAT_TEMPLATE_NULL_OUT_STRING_ARG;
    }
    if (!out_error) {
        return LLAMA_RS_APPLY_CHAT_TEMPLATE_NULL_OUT_ERROR_ARG;
    }

    try {
        const llama_vocab * vocab = llama_model_get_vocab(model);
        if (!vocab) {
            return LLAMA_RS_APPLY_CHAT_TEMPLATE_MODEL_HAS_NO_VOCAB;
        }

        std::string bos_token = token_text_or_empty(vocab, llama_vocab_bos(vocab));
        std::string eos_token = token_text_or_empty(vocab, llama_vocab_eos(vocab));

        common_chat_template tmpl(template_src, bos_token, eos_token);

        nlohmann::ordered_json messages = nlohmann::ordered_json::array();
        for (size_t index = 0; index < n_messages; index++) {
            messages.push_back({
                { "role", roles[index] ? roles[index] : "" },
                { "content", contents[index] ? contents[index] : "" },
            });
        }

        autoparser::generation_params inputs;
        inputs.messages = std::move(messages);
        inputs.tools = nlohmann::ordered_json::array();
        inputs.add_generation_prompt = add_generation_prompt != 0;

        std::string rendered = common_chat_template_direct_apply(tmpl, inputs);
        if (rendered.empty()) {
            return LLAMA_RS_APPLY_CHAT_TEMPLATE_TEMPLATE_APPLICATION_FAILED;
        }

        *out_string = llama_rs_dup_string(rendered);
        if (!*out_string) {
            return LLAMA_RS_APPLY_CHAT_TEMPLATE_ERROR_STRING_ALLOCATION_FAILED;
        }

        return LLAMA_RS_APPLY_CHAT_TEMPLATE_OK;
    } catch (const std::bad_alloc &) {
        // Out of memory: do not attempt to allocate an error message.
        return LLAMA_RS_APPLY_CHAT_TEMPLATE_ERROR_STRING_ALLOCATION_FAILED;
    } catch (const std::exception & ex) {
        return apply_chat_template_exception_status(ex.what(), out_error);
    } catch (...) {
        return apply_chat_template_exception_status("unknown c++ exception", out_error);
    }
}
37 changes: 37 additions & 0 deletions llama-cpp-bindings-sys/wrapper_chat_apply.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#pragma once

#include "llama.cpp/include/llama.h"
#include "wrapper_utils.h"

#include <stddef.h>

#ifdef __cplusplus
extern "C" {
#endif

/* Status codes returned by llama_rs_apply_chat_template. */
typedef enum llama_rs_apply_chat_template_status {
/* The rendered prompt was duplicated into *out_string. */
LLAMA_RS_APPLY_CHAT_TEMPLATE_OK = 0,
/* `model` was null. */
LLAMA_RS_APPLY_CHAT_TEMPLATE_NULL_MODEL_ARG,
/* `template_src` was null. */
LLAMA_RS_APPLY_CHAT_TEMPLATE_NULL_TEMPLATE_ARG,
/* `n_messages` was non-zero but `roles` or `contents` was null. */
LLAMA_RS_APPLY_CHAT_TEMPLATE_NULL_MESSAGES_ARG,
/* `out_string` was null. */
LLAMA_RS_APPLY_CHAT_TEMPLATE_NULL_OUT_STRING_ARG,
/* `out_error` was null. */
LLAMA_RS_APPLY_CHAT_TEMPLATE_NULL_OUT_ERROR_ARG,
/* llama_model_get_vocab returned null for the model. */
LLAMA_RS_APPLY_CHAT_TEMPLATE_MODEL_HAS_NO_VOCAB,
/* Applying the template produced an empty string. */
LLAMA_RS_APPLY_CHAT_TEMPLATE_TEMPLATE_APPLICATION_FAILED,
/* Duplicating the output or error string failed, or std::bad_alloc was caught. */
LLAMA_RS_APPLY_CHAT_TEMPLATE_ERROR_STRING_ALLOCATION_FAILED,
/* Another C++ exception was caught; its message is stored in *out_error. */
LLAMA_RS_APPLY_CHAT_TEMPLATE_VENDORED_THREW_CXX_EXCEPTION,
} llama_rs_apply_chat_template_status;

/*
 * Applies the chat template `template_src` to `n_messages` (role, content) pairs.
 *
 * model                  Model whose vocabulary supplies the BOS/EOS token text; must not be null.
 * template_src           Template source text; must not be null.
 * roles, contents        Parallel arrays of `n_messages` C strings; may be null only when
 *                        `n_messages` is 0. Null entries are treated as empty strings.
 * add_generation_prompt  Non-zero enables the template's generation prompt.
 * out_string             Receives a newly duplicated rendered prompt on OK, otherwise null.
 * out_error              Receives a newly duplicated exception message on
 *                        VENDORED_THREW_CXX_EXCEPTION, otherwise null.
 *
 * NOTE(review): the returned strings are presumably released with the matching helper
 * declared in wrapper_utils.h; confirm against that header.
 */
llama_rs_apply_chat_template_status llama_rs_apply_chat_template(
const struct llama_model * model,
const char * template_src,
const char * const * roles,
const char * const * contents,
size_t n_messages,
int add_generation_prompt,
char ** out_string,
char ** out_error);

#ifdef __cplusplus
}
#endif
Binary file added llama-cpp-bindings-tests/fixtures/orange_cat.wav
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
use anyhow::Result;
use llama_cpp_bindings::model::LlamaChatMessage;
use llama_cpp_bindings::model::LlamaModel;
use llama_cpp_bindings::mtmd::mtmd_default_marker;

/// Renders a single-message `user` prompt through the model's own chat template, where the
/// message content is the default media marker immediately followed by `question`.
///
/// # Errors
///
/// Forwards media-marker lookup, chat-template lookup, message construction, and template
/// application errors.
pub fn build_user_prompt_with_media_marker(model: &LlamaModel, question: &str) -> Result<String> {
    let media_marker = mtmd_default_marker()?;
    let marked_question = format!("{media_marker}{question}");
    let template = model.chat_template(None)?;
    let user_message = LlamaChatMessage::new(String::from("user"), marked_question)?;
    let prompt = model.apply_chat_template(&template, &[user_message], true)?;

    Ok(prompt)
}
36 changes: 36 additions & 0 deletions llama-cpp-bindings-tests/src/chunk_token_breakdown.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
use anyhow::Context;
use anyhow::Result;
use llama_cpp_bindings::mtmd::MtmdInputChunkType;
use llama_cpp_bindings::mtmd::MtmdInputChunks;

/// Token counts of a multimodal chunk list, split by chunk type.
///
/// The derives let test code print breakdowns and compare them with `assert_eq!`;
/// `Default` yields the all-zero breakdown.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct ChunkTokenBreakdown {
    /// Total tokens contributed by text chunks.
    pub text: u64,
    /// Total tokens contributed by image chunks.
    pub image: u64,
    /// Total tokens contributed by audio chunks.
    pub audio: u64,
}

impl ChunkTokenBreakdown {
    /// Walks every chunk in `chunks` and sums its token count into the counter that
    /// matches the chunk's type.
    ///
    /// # Errors
    ///
    /// Fails when a chunk index below `chunks.len()` yields no chunk, when a token count
    /// does not fit in `u64`, or when the chunk type cannot be classified.
    pub fn from_chunks(chunks: &MtmdInputChunks) -> Result<Self> {
        let (mut text, mut image, mut audio) = (0_u64, 0_u64, 0_u64);

        for index in 0..chunks.len() {
            let chunk = chunks
                .get(index)
                .with_context(|| format!("chunk index {index} is missing"))?;
            let n_tokens = u64::try_from(chunk.n_tokens())?;
            // Pick the counter for this chunk's type, then add to it in one place.
            let counter = match chunk.chunk_type()? {
                MtmdInputChunkType::Text => &mut text,
                MtmdInputChunkType::Image => &mut image,
                MtmdInputChunkType::Audio => &mut audio,
            };
            *counter += n_tokens;
        }

        Ok(Self { text, image, audio })
    }
}
46 changes: 46 additions & 0 deletions llama-cpp-bindings-tests/src/classify_sample_loop.rs
Original file line number Diff line number Diff line change
Expand Up @@ -129,4 +129,50 @@ mod tests {
assert_eq!(outcome.observed_reasoning, 0);
assert_eq!(outcome.observed_undeterminable, 0);
}

/// A reasoning token recorded with the final flag set to `false` is counted and its visible
/// piece is appended to the reasoning stream.
#[test]
fn record_outcome_reasoning_token_streams_visible_piece() {
    let mut tally = ClassifySampleLoopOutcome::default();
    let reasoning_ingest = IngestOutcome {
        sampled_token: SampledToken::Reasoning(LlamaToken(7)),
        visible_piece: String::from("thinking"),
        raw_piece: String::new(),
    };

    record_outcome(&reasoning_ingest, &mut tally, false);

    assert_eq!(tally.observed_reasoning, 1);
    assert_eq!(tally.reasoning_stream, "thinking");
}

/// A reasoning token recorded with the final flag set to `true` is still counted, but its
/// visible piece is left out of the reasoning stream.
#[test]
fn record_outcome_reasoning_token_at_end_of_generation_is_not_streamed() {
    let mut tally = ClassifySampleLoopOutcome::default();
    let final_reasoning_ingest = IngestOutcome {
        sampled_token: SampledToken::Reasoning(LlamaToken(7)),
        visible_piece: String::from("thinking"),
        raw_piece: String::new(),
    };

    record_outcome(&final_reasoning_ingest, &mut tally, true);

    assert_eq!(tally.observed_reasoning, 1);
    assert_eq!(tally.reasoning_stream, "");
}

/// An undeterminable token bumps its own counter and leaves both the content stream and the
/// reasoning stream empty.
#[test]
fn record_outcome_undeterminable_token_counts_without_streaming() {
    let mut tally = ClassifySampleLoopOutcome::default();
    let undeterminable_ingest = IngestOutcome {
        sampled_token: SampledToken::Undeterminable(LlamaToken(9)),
        visible_piece: String::from("ignored"),
        raw_piece: String::new(),
    };

    record_outcome(&undeterminable_ingest, &mut tally, false);

    assert_eq!(tally.observed_undeterminable, 1);
    assert!(tally.content_stream.is_empty());
    assert!(tally.reasoning_stream.is_empty());
}
}
4 changes: 4 additions & 0 deletions llama-cpp-bindings-tests/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,2 +1,6 @@
pub mod build_user_prompt_with_media_marker;
pub mod chunk_token_breakdown;
pub mod classify_sample_loop;
pub mod fixtures_dir;
pub mod prime_kv_cache;
pub mod prime_kv_cache_with;
10 changes: 3 additions & 7 deletions llama-cpp-bindings-tests/src/prime_kv_cache.rs
Original file line number Diff line number Diff line change
@@ -1,15 +1,11 @@
use anyhow::Result;
use llama_cpp_bindings::context::LlamaContext;
use llama_cpp_bindings::llama_batch::LlamaBatch;
use llama_cpp_bindings::model::AddBos;
use llama_cpp_test_harness::LlamaFixture;

use crate::prime_kv_cache_with::prime_kv_cache_with;

/// Tokenizes the fixed prompt `"Hello world"` with the fixture's model and decodes it on
/// `context`, using a batch capacity of 512.
///
/// # Errors
/// Forwards tokenization, batch construction, and [`LlamaContext::decode`] errors verbatim.
pub fn prime_kv_cache(fixture: &LlamaFixture<'_>, context: &mut LlamaContext<'_>) -> Result<()> {
// NOTE(review): this diff view appears to show the removed inline body together with the
// added delegating call on the last line; confirm which statements are live in the file.
let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?;
let mut batch = LlamaBatch::new(512, 1)?;
batch.add_sequence(&tokens, 0, false)?;
context.decode(&mut batch)?;
Ok(())
prime_kv_cache_with(fixture, context, "Hello world", 512)
}
20 changes: 20 additions & 0 deletions llama-cpp-bindings-tests/src/prime_kv_cache_with.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
use anyhow::Result;
use llama_cpp_bindings::context::LlamaContext;
use llama_cpp_bindings::llama_batch::LlamaBatch;
use llama_cpp_bindings::model::AddBos;
use llama_cpp_test_harness::LlamaFixture;

/// Tokenizes `text` with the fixture's model (always adding BOS), places the tokens in a
/// fresh batch of capacity `batch_capacity` as sequence 0, and decodes that batch on
/// `context`.
///
/// # Errors
/// Forwards tokenization, batch construction, and [`LlamaContext::decode`] errors verbatim.
pub fn prime_kv_cache_with(
    fixture: &LlamaFixture<'_>,
    context: &mut LlamaContext<'_>,
    text: &str,
    batch_capacity: usize,
) -> Result<()> {
    let prompt_tokens = fixture.model.str_to_token(text, AddBos::Always)?;

    let mut prompt_batch = LlamaBatch::new(batch_capacity, 1)?;
    prompt_batch.add_sequence(&prompt_tokens, 0, false)?;

    context.decode(&mut prompt_batch)?;

    Ok(())
}
Loading
Loading