Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 32 additions & 14 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

15 changes: 11 additions & 4 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ async-trait = "0.1"
bytes = "1.11"
cadence = "1.6"
clap = { version = "4.5", features = ["derive"] }
command_handler = "=0.1.0"
crossterm = { version = "=0.29.0", features = ["event-stream"] }
dashmap = "6.1"
encoding_rs = { version = "0.8", features = ["serde"] }
Expand All @@ -55,9 +56,9 @@ http = "1"
image = "0.25"
indoc = "2"
jsonschema = { version = "0.37", default-features = false }
llama-cpp-bindings = "=0.7.0"
llama-cpp-bindings-sys = "=0.7.0"
llama-cpp-bindings-types = "=0.7.0"
llama-cpp-bindings = "=0.9.0"
llama-cpp-bindings-sys = "=0.9.0"
llama-cpp-bindings-types = "=0.9.0"
base64 = "0.22"
log = "0.4"
mime_guess = "2"
Expand Down Expand Up @@ -118,26 +119,32 @@ pedantic = { level = "deny", priority = -1 }

# Specific linter settings
allow_attributes = "deny"
cast_possible_truncation = "allow"
cast_sign_loss = "allow"
dbg_macro = "deny"
error_impl_error = "deny"
error_impl_error = "allow"
expect_used = "deny"
future_not_send = "allow"
infinite_loop = "deny"
literal_string_with_formatting_args = "allow"
missing_docs_in_private_items = "allow"
missing_errors_doc = "allow"
missing_panics_doc = "allow"
module_name_repetitions = "allow"
needless_pass_by_value = "allow"
panic = "deny"
print_stderr = "deny"
print_stdout = "deny"
rc_mutex = "deny"
rest_pat_in_fully_bound_structs = "deny"
self_named_module_files = "deny"
significant_drop_tightening = "allow"
str_to_string = "deny"
# Integration tests live in tests/*.rs, which Cargo already compiles as test-only;
# wrapping every file in `#[cfg(test)] mod tests { }` would be redundant.
tests_outside_test_module = "allow"
todo = "deny"
too_many_arguments = "allow"
too_many_lines = "allow"
unimplemented = "deny"
unwrap_used = "deny"
Expand Down
3 changes: 2 additions & 1 deletion paddler_agent/src/chat_template_renderer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ pub mod raise_exception;
use anyhow::Context as _;
use anyhow::Result;
use minijinja::Environment;
use minijinja_contrib::add_to_environment;
use minijinja_contrib::pycompat::unknown_method_callback;
use paddler_messaging::chat_template::ChatTemplate;
use serde::ser::Serialize;
Expand All @@ -25,7 +26,7 @@ impl ChatTemplateRenderer {
minijinja_env.add_template_owned(CHAT_TEMPLATE_NAME, content)?;
minijinja_env.set_unknown_method_callback(unknown_method_callback);

minijinja_contrib::add_to_environment(&mut minijinja_env);
add_to_environment(&mut minijinja_env);
minijinja_env.add_filter("tojson", pyjinja_tojson);

Ok(Self { minijinja_env })
Expand Down
5 changes: 0 additions & 5 deletions paddler_agent/src/chat_template_renderer/pyjinja_tojson.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,6 @@ use minijinja::Value;
use minijinja::filters::tojson;
use minijinja::value::Kwargs;

#[expect(
clippy::needless_pass_by_value,
reason = "minijinja's Filter trait requires Kwargs by value; taking &Kwargs makes the \
function unregisterable as a filter"
)]
pub fn pyjinja_tojson(value: &Value, kwargs: Kwargs) -> Result<Value, Error> {
let indent: Option<Value> = kwargs.get("indent")?;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,19 @@ impl AdvanceGeneratingPhase<'_> {
}
};

let classified_outcomes = classify_token_phase::run(request, raw_token);
let classified_outcomes = match classify_token_phase::run(request, raw_token) {
Ok(outcomes) => outcomes,
Err(error) => {
error!(
"{:?}: sequence {} token classification failed: {error:#}",
self.scheduler_context.agent_name, request.state.sequence_id
);

return Some(AdvanceOutcome::Completed(
GeneratedTokenResult::DetokenizationFailed(error.to_string()),
));
}
};

let completion_phase = CompletionCheckPhase {
model: &self.scheduler_context.model,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
use anyhow::Result;
use llama_cpp_bindings::SampledToken;
use llama_cpp_bindings::sampled_token_classifier::IngestOutcome;
use llama_cpp_bindings::sampled_token_classifier::SampledTokenSection;
Expand All @@ -9,10 +10,11 @@ use crate::continuous_batch_scheduler::classified_token::ClassifiedToken;
pub fn run(
request: &mut ContinuousBatchActiveRequest,
raw_token: LlamaToken,
) -> Vec<ClassifiedToken> {
) -> Result<Vec<ClassifiedToken>> {
let section_before_ingest = request.token_classifier.current_section();
let outcomes = request.token_classifier.ingest(raw_token);
classify_ingest_outcomes(outcomes, section_before_ingest)
let outcomes = request.token_classifier.ingest(raw_token)?;

Ok(classify_ingest_outcomes(outcomes, section_before_ingest))
}

fn classify_ingest_outcomes(
Expand Down
38 changes: 23 additions & 15 deletions paddler_agent/src/continuous_batch_scheduler/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -385,15 +385,19 @@ impl ContinuousBatchScheduler {
)]
fn build_token_classifier_for_active_request(
&self,
) -> llama_cpp_bindings::SampledTokenClassifier<'static> {
let classifier = self.scheduler_context.model.sampled_token_classifier();
) -> Result<llama_cpp_bindings::SampledTokenClassifier<'static>> {
let classifier = self
.scheduler_context
.model
.sampled_token_classifier()
.context("failed to build the sampled token classifier")?;

unsafe {
Ok(unsafe {
std::mem::transmute::<
llama_cpp_bindings::SampledTokenClassifier<'_>,
llama_cpp_bindings::SampledTokenClassifier<'static>,
>(classifier)
}
})
}

fn build_tool_call_pipeline(
Expand Down Expand Up @@ -431,10 +435,6 @@ impl ContinuousBatchScheduler {
Ok(ToolCallPipelineBuildOutcome::Ready(pipeline))
}

#[expect(
clippy::too_many_arguments,
reason = "these are distinct concerns (the prompt, the generation config, the output channel, the stop signal, the slot guard) that do not form a cohesive value object; bundling them would violate single-responsibility grouping"
)]
fn accept_text_prompt(
&mut self,
prompt: &str,
Expand Down Expand Up @@ -527,7 +527,7 @@ impl ContinuousBatchScheduler {

let chain = self.create_sampler_chain();

let mut token_classifier = self.build_token_classifier_for_active_request();
let mut token_classifier = self.build_token_classifier_for_active_request()?;

token_classifier.record_prompt_tokens(prompt_tokens.len() as u64);
token_classifier.ingest_prompt_tokens(&prompt_tokens);
Expand Down Expand Up @@ -563,10 +563,6 @@ impl ContinuousBatchScheduler {
Ok(())
}

#[expect(
clippy::too_many_arguments,
reason = "these are distinct concerns (the multimodal context, prompt, images, generation config, the output channel, the stop signal, the slot guard) that do not form a cohesive value object; bundling them would violate single-responsibility grouping"
)]
fn accept_multimodal_request(
&mut self,
multimodal_context: &MtmdContext,
Expand Down Expand Up @@ -685,7 +681,7 @@ impl ContinuousBatchScheduler {

self.harvest_pending_samples_before_external_decode();

let mut token_classifier = self.build_token_classifier_for_active_request();
let mut token_classifier = self.build_token_classifier_for_active_request()?;

let batch_size_i32 = i32::try_from(batch_size).context("batch_size does not fit in i32")?;

Expand Down Expand Up @@ -810,7 +806,19 @@ impl ContinuousBatchScheduler {
// batch via `pending_sampled_token`; their user-visible emission
// happens in `advance_generating_phase` after the next decode,
// not here.
let _ = active_request.token_classifier.ingest(raw_token);
if let Err(error) = active_request.token_classifier.ingest(raw_token) {
error!(
"{:?}: sequence {} pre-eval harvest detokenization error: {error:#}",
self.scheduler_context.agent_name, active_request.state.sequence_id
);
active_request.complete_with_outcome(
self.scheduler_context.agent_name.as_deref(),
GeneratedTokenResult::DetokenizationFailed(error.to_string()),
);

continue;
}

active_request.state.pending_sampled_token =
Some(llama_cpp_bindings::SampledToken::Content(raw_token));
active_request.state.i_batch = None;
Expand Down
Loading
Loading