intentee · mcharytoniuk · Jun 10, 2026 · Jun 10, 2026
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -41,6 +41,7 @@ async-trait = "0.1"
 bytes = "1.11"
 cadence = "1.6"
 clap = { version = "4.5", features = ["derive"] }
+command_handler = "=0.1.0"
 crossterm = { version = "=0.29.0", features = ["event-stream"] }
 dashmap = "6.1"
 encoding_rs = { version = "0.8", features = ["serde"] }
@@ -55,9 +56,9 @@ http = "1"
 image = "0.25"
 indoc = "2"
 jsonschema = { version = "0.37", default-features = false }
-llama-cpp-bindings = "=0.7.0"
-llama-cpp-bindings-sys = "=0.7.0"
-llama-cpp-bindings-types = "=0.7.0"
+llama-cpp-bindings = "=0.9.0"
+llama-cpp-bindings-sys = "=0.9.0"
+llama-cpp-bindings-types = "=0.9.0"
 base64 = "0.22"
 log = "0.4"
 mime_guess = "2"
@@ -118,26 +119,32 @@ pedantic = { level = "deny", priority = -1 }
 
 # Specific linter settings
 allow_attributes = "deny"
+cast_possible_truncation = "allow"
+cast_sign_loss = "allow"
 dbg_macro = "deny"
-error_impl_error = "deny"
+error_impl_error = "allow"
 expect_used = "deny"
+future_not_send = "allow"
 infinite_loop = "deny"
 literal_string_with_formatting_args = "allow"
 missing_docs_in_private_items = "allow"
 missing_errors_doc = "allow"
 missing_panics_doc = "allow"
 module_name_repetitions = "allow"
+needless_pass_by_value = "allow"
 panic = "deny"
 print_stderr = "deny"
 print_stdout = "deny"
 rc_mutex = "deny"
 rest_pat_in_fully_bound_structs = "deny"
 self_named_module_files = "deny"
+significant_drop_tightening = "allow"
 str_to_string = "deny"
 # integration tests live in tests/*.rs which is already compiled test-only by Cargo;
 # wrapping every file in `#[cfg(test)] mod tests { }` would be redundant.
 tests_outside_test_module = "allow"
 todo = "deny"
+too_many_arguments = "allow"
 too_many_lines = "allow"
 unimplemented = "deny"
 unwrap_used = "deny"

diff --git a/paddler_agent/src/chat_template_renderer/mod.rs b/paddler_agent/src/chat_template_renderer/mod.rs
@@ -4,6 +4,7 @@ pub mod raise_exception;
 use anyhow::Context as _;
 use anyhow::Result;
 use minijinja::Environment;
+use minijinja_contrib::add_to_environment;
 use minijinja_contrib::pycompat::unknown_method_callback;
 use paddler_messaging::chat_template::ChatTemplate;
 use serde::ser::Serialize;
@@ -25,7 +26,7 @@ impl ChatTemplateRenderer {
         minijinja_env.add_template_owned(CHAT_TEMPLATE_NAME, content)?;
         minijinja_env.set_unknown_method_callback(unknown_method_callback);
 
-        minijinja_contrib::add_to_environment(&mut minijinja_env);
+        add_to_environment(&mut minijinja_env);
         minijinja_env.add_filter("tojson", pyjinja_tojson);
 
         Ok(Self { minijinja_env })

diff --git a/paddler_agent/src/chat_template_renderer/pyjinja_tojson.rs b/paddler_agent/src/chat_template_renderer/pyjinja_tojson.rs
@@ -4,11 +4,6 @@ use minijinja::Value;
 use minijinja::filters::tojson;
 use minijinja::value::Kwargs;
 
-#[expect(
-    clippy::needless_pass_by_value,
-    reason = "minijinja's Filter trait requires Kwargs by value; taking &Kwargs makes the \
-              function unregisterable as a filter"
-)]
 pub fn pyjinja_tojson(value: &Value, kwargs: Kwargs) -> Result<Value, Error> {
     let indent: Option<Value> = kwargs.get("indent")?;
 

diff --git a/paddler_agent/src/continuous_batch_scheduler/advance_generating_phase.rs b/paddler_agent/src/continuous_batch_scheduler/advance_generating_phase.rs
@@ -79,7 +79,19 @@ impl AdvanceGeneratingPhase<'_> {
             }
         };
 
-        let classified_outcomes = classify_token_phase::run(request, raw_token);
+        let classified_outcomes = match classify_token_phase::run(request, raw_token) {
+            Ok(outcomes) => outcomes,
+            Err(error) => {
+                error!(
+                    "{:?}: sequence {} token classification failed: {error:#}",
+                    self.scheduler_context.agent_name, request.state.sequence_id
+                );
+
+                return Some(AdvanceOutcome::Completed(
+                    GeneratedTokenResult::DetokenizationFailed(error.to_string()),
+                ));
+            }
+        };
 
         let completion_phase = CompletionCheckPhase {
             model: &self.scheduler_context.model,

diff --git a/paddler_agent/src/continuous_batch_scheduler/classify_token_phase.rs b/paddler_agent/src/continuous_batch_scheduler/classify_token_phase.rs
@@ -1,3 +1,4 @@
+use anyhow::Result;
 use llama_cpp_bindings::SampledToken;
 use llama_cpp_bindings::sampled_token_classifier::IngestOutcome;
 use llama_cpp_bindings::sampled_token_classifier::SampledTokenSection;
@@ -9,10 +10,11 @@ use crate::continuous_batch_scheduler::classified_token::ClassifiedToken;
 pub fn run(
     request: &mut ContinuousBatchActiveRequest,
     raw_token: LlamaToken,
-) -> Vec<ClassifiedToken> {
+) -> Result<Vec<ClassifiedToken>> {
     let section_before_ingest = request.token_classifier.current_section();
-    let outcomes = request.token_classifier.ingest(raw_token);
-    classify_ingest_outcomes(outcomes, section_before_ingest)
+    let outcomes = request.token_classifier.ingest(raw_token)?;
+
+    Ok(classify_ingest_outcomes(outcomes, section_before_ingest))
 }
 
 fn classify_ingest_outcomes(

diff --git a/paddler_agent/src/continuous_batch_scheduler/mod.rs b/paddler_agent/src/continuous_batch_scheduler/mod.rs
@@ -385,15 +385,19 @@ impl ContinuousBatchScheduler {
     )]
     fn build_token_classifier_for_active_request(
         &self,
-    ) -> llama_cpp_bindings::SampledTokenClassifier<'static> {
-        let classifier = self.scheduler_context.model.sampled_token_classifier();
+    ) -> Result<llama_cpp_bindings::SampledTokenClassifier<'static>> {
+        let classifier = self
+            .scheduler_context
+            .model
+            .sampled_token_classifier()
+            .context("failed to build the sampled token classifier")?;
 
-        unsafe {
+        Ok(unsafe {
             std::mem::transmute::<
                 llama_cpp_bindings::SampledTokenClassifier<'_>,
                 llama_cpp_bindings::SampledTokenClassifier<'static>,
             >(classifier)
-        }
+        })
     }
 
     fn build_tool_call_pipeline(
@@ -431,10 +435,6 @@ impl ContinuousBatchScheduler {
         Ok(ToolCallPipelineBuildOutcome::Ready(pipeline))
     }
 
-    #[expect(
-        clippy::too_many_arguments,
-        reason = "these are distinct concerns (the prompt, the generation config, the output channel, the stop signal, the slot guard) that do not form a cohesive value object; bundling them would violate single-responsibility grouping"
-    )]
     fn accept_text_prompt(
         &mut self,
         prompt: &str,
@@ -527,7 +527,7 @@ impl ContinuousBatchScheduler {
 
         let chain = self.create_sampler_chain();
 
-        let mut token_classifier = self.build_token_classifier_for_active_request();
+        let mut token_classifier = self.build_token_classifier_for_active_request()?;
 
         token_classifier.record_prompt_tokens(prompt_tokens.len() as u64);
         token_classifier.ingest_prompt_tokens(&prompt_tokens);
@@ -563,10 +563,6 @@ impl ContinuousBatchScheduler {
         Ok(())
     }
 
-    #[expect(
-        clippy::too_many_arguments,
-        reason = "these are distinct concerns (the multimodal context, prompt, images, generation config, the output channel, the stop signal, the slot guard) that do not form a cohesive value object; bundling them would violate single-responsibility grouping"
-    )]
     fn accept_multimodal_request(
         &mut self,
         multimodal_context: &MtmdContext,
@@ -685,7 +681,7 @@ impl ContinuousBatchScheduler {
 
         self.harvest_pending_samples_before_external_decode();
 
-        let mut token_classifier = self.build_token_classifier_for_active_request();
+        let mut token_classifier = self.build_token_classifier_for_active_request()?;
 
         let batch_size_i32 = i32::try_from(batch_size).context("batch_size does not fit in i32")?;
 
@@ -810,7 +806,19 @@ impl ContinuousBatchScheduler {
                     // batch via `pending_sampled_token`; their user-visible emission
                     // happens in `advance_generating_phase` after the next decode,
                     // not here.
-                    let _ = active_request.token_classifier.ingest(raw_token);
+                    if let Err(error) = active_request.token_classifier.ingest(raw_token) {
+                        error!(
+                            "{:?}: sequence {} pre-eval harvest detokenization error: {error:#}",
+                            self.scheduler_context.agent_name, active_request.state.sequence_id
+                        );
+                        active_request.complete_with_outcome(
+                            self.scheduler_context.agent_name.as_deref(),
+                            GeneratedTokenResult::DetokenizationFailed(error.to_string()),
+                        );
+
+                        continue;
+                    }
+
                     active_request.state.pending_sampled_token =
                         Some(llama_cpp_bindings::SampledToken::Content(raw_token));
                     active_request.state.i_batch = None;