diff --git a/.gitattributes b/.gitattributes
index e27f70f..8b5d28d 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -1,3 +1,4 @@
 *.pbxproj -text
 # specific for windows script files
 *.bat text eol=crlf
+android/src/main/jniLibs/arm64-v8a/libcactus.a filter=lfs diff=lfs merge=lfs -text
diff --git a/README.md b/README.md
index 1a7dd08..e42d6af 100644
--- a/README.md
+++ b/README.md
@@ -649,18 +649,18 @@ console.log('Language:', result.language);  // e.g. 'en'
 console.log('Confidence:', result.confidence);
 ```
 
-## Voice Activity Detection (VAD)
+## Audio Processing
 
-The `CactusVAD` class detects speech segments in audio, returning timestamped intervals where speech is present.
+The `CactusAudio` class provides voice activity detection (VAD), speaker diarization, and speaker embedding extraction.
 
-### Class
+### Voice Activity Detection
 
 ```typescript
-import { CactusVAD } from 'cactus-react-native';
+import { CactusAudio } from 'cactus-react-native';
 
-const cactusVAD = new CactusVAD({ model: 'silero-vad' });
+const cactusAudio = new CactusAudio({ model: 'silero-vad' });
 
-const result = await cactusVAD.vad({
+const result = await cactusAudio.vad({
   audio: 'path/to/audio.wav',
   options: {
     threshold: 0.5,
@@ -674,22 +674,68 @@ console.log('Speech segments:', result.segments);
 console.log('Total time (ms):', result.totalTime);
 ```
 
+### Speaker Diarization
+
+```typescript
+import { CactusAudio } from 'cactus-react-native';
+
+const cactusAudio = new CactusAudio({ model: 'silero-vad' });
+
+const result = await cactusAudio.diarize({
+  audio: 'path/to/audio.wav',
+  options: {
+    numSpeakers: 2,
+    minSpeakers: 1,
+    maxSpeakers: 4,
+  }
+});
+
+console.log('Number of speakers:', result.numSpeakers);
+console.log('Scores:', result.scores);
+```
+
+### Speaker Embedding
+
+```typescript
+import { CactusAudio } from 'cactus-react-native';
+
+const cactusAudio = new CactusAudio({ model: 'silero-vad' });
+
+const result = await cactusAudio.embedSpeaker({
+  audio: 'path/to/audio.wav',
+});
+
+console.log('Speaker embedding:', result.embedding);
+```
+
 ### Hook
 
 ```tsx
-import { useCactusVAD } from 'cactus-react-native';
+import { useCactusAudio } from 'cactus-react-native';
 
 const App = () => {
-  const cactusVAD = useCactusVAD({ model: 'silero-vad' });
+  const cactusAudio = useCactusAudio({ model: 'silero-vad' });
 
   const handleVAD = async () => {
-    const result = await cactusVAD.vad({
+    const result = await cactusAudio.vad({
       audio: 'path/to/audio.wav',
     });
     console.log('Speech segments:', result.segments);
   };
 
-  return <Button title="Detect Speech" onPress={handleVAD} />;
+  const handleDiarize = async () => {
+    const result = await cactusAudio.diarize({
+      audio: 'path/to/audio.wav',
+    });
+    console.log('Speakers:', result.numSpeakers);
+  };
+
+  return (
+    <>
+      <Button title="Detect Speech" onPress={handleVAD} />
+      <Button title="Diarize" onPress={handleDiarize} />
+    </>
+  );
 };
 ```
 
@@ -985,9 +1031,19 @@ Performs text completion with optional streaming and tool support. Automatically
   - `toolRagTopK` - Number of tools to select via RAG when tool list is large (default: `2`).
   - `includeStopSequences` - Whether to include stop sequences in the response (default: `false`).
   - `useVad` - Whether to use VAD preprocessing (default: `true`).
+  - `enableThinking` - Whether to enable thinking/reasoning output if supported by the model (default: unset).
 - `tools` - Array of `CactusLMTool` objects for function calling.
 - `onToken` - Callback for streaming tokens.
 
+**`prefill(params: CactusLMPrefillParams): Promise<CactusLMPrefillResult>`**
+
+Runs prompt prefill without generating any output tokens. Useful for measuring prefill performance or warming up the model's KV cache. Automatically calls `init()` if not already initialized. Throws an error if a generation is already in progress.
+
+**Parameters:**
+- `messages` - Array of `CactusLMMessage` objects.
+- `options` - Optional `CactusLMCompleteOptions`; the same options accepted by `complete`.
+- `tools` - Optional array of `CactusLMTool` objects.
+
 **`tokenize(params: CactusLMTokenizeParams): Promise<CactusLMTokenizeResult>`**
 
 Converts text into tokens using the model's tokenizer.
@@ -1038,7 +1094,7 @@ Returns available models.
 
 **`getModelName(): string`**
 
-Returns the model slug or path the instance was created with.
+Returns the computed model identifier, which includes the quantization suffix and, for pro models, a `-pro` suffix (e.g., `'qwen3-0.6b-int8'`, `'lfm2-vl-450m-int4-pro'`).
 
 ### useCactusLM Hook
 
@@ -1121,6 +1177,7 @@ Starts a streaming transcription session. Automatically calls `init()` if not al
 - `confirmationThreshold` - Fuzzy match ratio required to confirm a transcription segment (default: `0.99`).
 - `minChunkSize` - Minimum number of audio samples before processing (default: `32000`).
 - `telemetryEnabled` - Enable telemetry for this session (default: `true`).
+- `language` - Language code for transcription (e.g., `'en'`, `'es'`, `'fr'`). If not set, language is auto-detected.
 
 **`streamTranscribeProcess(params: CactusSTTStreamTranscribeProcessParams): Promise<CactusSTTStreamTranscribeProcessResult>`**
 
@@ -1167,7 +1224,7 @@ Returns available speech-to-text models.
 
 **`getModelName(): string`**
 
-Returns the model slug or path the instance was created with.
+Returns the computed model identifier, which includes the quantization suffix and, for pro models, a `-pro` suffix (e.g., `'whisper-small-int8'`).
 
 ### useCactusSTT Hook
 
@@ -1200,32 +1257,32 @@ The `useCactusSTT` hook manages a `CactusSTT` instance with reactive state. When
 - `destroy(): Promise<void>` - Releases all resources associated with the model. Clears the `transcription`, `streamTranscribeConfirmed`, and `streamTranscribePending` state. Automatically called when the component unmounts.
 - `getModels(): Promise<CactusModel[]>` - Returns available speech-to-text models.
 
-### CactusVAD Class
+### CactusAudio Class
 
 #### Constructor
 
-**`new CactusVAD(params?: CactusVADParams)`**
+**`new CactusAudio(params?: CactusAudioParams)`**
 
 **Parameters:**
-- `model` - Model slug or absolute path to a VAD model file (default: `'silero-vad'`).
+- `model` - Model slug or absolute path to an audio model file (default: `'silero-vad'`).
 - `options` - Model options:
   - `quantization` - Quantization type: `'int4'` | `'int8'` (default: `'int8'`).
   - `pro` - Enable NPU-accelerated models (default: `false`).
 
 #### Methods
 
-**`download(params?: CactusVADDownloadParams): Promise<void>`**
+**`download(params?: CactusAudioDownloadParams): Promise<void>`**
 
-Downloads the VAD model. If the model is already downloaded, returns immediately with progress `1`. Throws an error if a download is already in progress.
+Downloads the audio model. If the model is already downloaded, returns immediately with progress `1`. Throws an error if a download is already in progress.
 
 **Parameters:**
 - `onProgress` - Callback for download progress (0-1).
 
 **`init(): Promise<void>`**
 
-Initializes the VAD model. Safe to call multiple times (idempotent). Throws an error if the model is not downloaded yet.
+Initializes the audio model. Safe to call multiple times (idempotent). Throws an error if the model is not downloaded yet.
 
-**`vad(params: CactusVADVadParams): Promise<CactusVADResult>`**
+**`vad(params: CactusAudioVADParams): Promise<CactusAudioVADResult>`**
 
 Runs voice activity detection on the given audio. Automatically calls `init()` if not already initialized.
 
@@ -1243,21 +1300,41 @@ Runs voice activity detection on the given audio. Automatically calls `init()` i
   - `minSilenceAtMaxSpeech` - Minimum silence at max speech duration.
   - `useMaxPossSilAtMaxSpeech` - Whether to use maximum possible silence at max speech.
 
+**`diarize(params: CactusAudioDiarizeParams): Promise<CactusAudioDiarizeResult>`**
+
+Runs speaker diarization on the given audio. Automatically calls `init()` if not already initialized.
+
+**Parameters:**
+- `audio` - Path to the audio file or raw PCM samples as a byte array.
+- `options` - Diarize options:
+  - `stepMs` - Step size in milliseconds.
+  - `threshold` - Diarization threshold.
+  - `numSpeakers` - Expected number of speakers.
+  - `minSpeakers` - Minimum number of speakers.
+  - `maxSpeakers` - Maximum number of speakers.
+
+**`embedSpeaker(params: CactusAudioEmbedSpeakerParams): Promise<CactusAudioEmbedSpeakerResult>`**
+
+Extracts a speaker embedding vector from the given audio. Automatically calls `init()` if not already initialized.
+
+**Parameters:**
+- `audio` - Path to the audio file or raw PCM samples as a byte array.
+
 **`destroy(): Promise<void>`**
 
 Releases all resources associated with the model. Safe to call even if the model is not initialized.
 
 **`getModels(): Promise<CactusModel[]>`**
 
-Returns available VAD models.
+Returns available audio models.
 
 **`getModelName(): string`**
 
-Returns the model slug or path the instance was created with.
+Returns the computed model identifier, which includes the quantization suffix and, for pro models, a `-pro` suffix (e.g., `'silero-vad-int8'`).
 
-### useCactusVAD Hook
+### useCactusAudio Hook
 
-The `useCactusVAD` hook manages a `CactusVAD` instance with reactive state. When model parameters (`model`, `options`) change, the hook creates a new instance and resets all state. The hook automatically cleans up resources when the component unmounts.
+The `useCactusAudio` hook manages a `CactusAudio` instance with reactive state. When model parameters (`model`, `options`) change, the hook creates a new instance and resets all state. The hook automatically cleans up resources when the component unmounts.
 
 #### State
 
@@ -1269,11 +1346,13 @@ The `useCactusVAD` hook manages a `CactusVAD` instance with reactive state. When
 
 #### Methods
 
-- `download(params?: CactusVADDownloadParams): Promise<void>` - Downloads the model. Updates `isDownloading` and `downloadProgress` state during download. Sets `isDownloaded` to `true` on success.
+- `download(params?: CactusAudioDownloadParams): Promise<void>` - Downloads the model. Updates `isDownloading` and `downloadProgress` state during download. Sets `isDownloaded` to `true` on success.
 - `init(): Promise<void>` - Initializes the model.
-- `vad(params: CactusVADVadParams): Promise<CactusVADResult>` - Runs voice activity detection.
+- `vad(params: CactusAudioVADParams): Promise<CactusAudioVADResult>` - Runs voice activity detection.
+- `diarize(params: CactusAudioDiarizeParams): Promise<CactusAudioDiarizeResult>` - Runs speaker diarization.
+- `embedSpeaker(params: CactusAudioEmbedSpeakerParams): Promise<CactusAudioEmbedSpeakerResult>` - Extracts a speaker embedding.
 - `destroy(): Promise<void>` - Releases all resources. Automatically called when the component unmounts.
-- `getModels(): Promise<CactusModel[]>` - Returns available VAD models.
+- `getModels(): Promise<CactusModel[]>` - Returns available audio models.
 
 ### CactusIndex Class
 
@@ -1413,6 +1492,7 @@ interface CactusLMCompleteOptions {
   toolRagTopK?: number;
   includeStopSequences?: boolean;
   useVad?: boolean;
+  enableThinking?: boolean;
 }
 ```
 
@@ -1446,12 +1526,36 @@ interface CactusLMCompleteParams {
 }
 ```
 
+### CactusLMPrefillParams
+
+```typescript
+interface CactusLMPrefillParams {
+  messages: CactusLMMessage[];
+  options?: CactusLMCompleteOptions;
+  tools?: CactusLMTool[];
+}
+```
+
+### CactusLMPrefillResult
+
+```typescript
+interface CactusLMPrefillResult {
+  success: boolean;
+  error: string | null;
+  prefillTokens: number;
+  prefillTps: number;
+  totalTimeMs: number;
+  ramUsageMb: number;
+}
+```
+
 ### CactusLMCompleteResult
 
 ```typescript
 interface CactusLMCompleteResult {
   success: boolean;
   response: string;
+  thinking?: string;
   functionCalls?: {
     name: string;
     arguments: { [key: string]: any };
@@ -1658,6 +1762,7 @@ interface CactusSTTStreamTranscribeStartOptions {
   confirmationThreshold?: number;
   minChunkSize?: number;
   telemetryEnabled?: boolean;
+  language?: string;
 }
 ```
 
@@ -1728,27 +1833,27 @@ interface CactusSTTDetectLanguageResult {
 }
 ```
 
-### CactusVADParams
+### CactusAudioParams
 
 ```typescript
-interface CactusVADParams {
+interface CactusAudioParams {
   model?: string;
   options?: CactusModelOptions;
 }
 ```
 
-### CactusVADDownloadParams
+### CactusAudioDownloadParams
 
 ```typescript
-interface CactusVADDownloadParams {
+interface CactusAudioDownloadParams {
   onProgress?: (progress: number) => void;
 }
 ```
 
-### CactusVADOptions
+### CactusAudioVADOptions
 
 ```typescript
-interface CactusVADOptions {
+interface CactusAudioVADOptions {
   threshold?: number;
   negThreshold?: number;
   minSpeechDurationMs?: number;
@@ -1762,31 +1867,85 @@ interface CactusVADOptions {
 }
 ```
 
-### CactusVADSegment
+### CactusAudioVADSegment
 
 ```typescript
-interface CactusVADSegment {
+interface CactusAudioVADSegment {
   start: number;
   end: number;
 }
 ```
 
-### CactusVADResult
+### CactusAudioVADResult
 
 ```typescript
-interface CactusVADResult {
-  segments: CactusVADSegment[];
+interface CactusAudioVADResult {
+  segments: CactusAudioVADSegment[];
   totalTime: number;
   ramUsage: number;
 }
 ```
 
-### CactusVADVadParams
+### CactusAudioVADParams
 
 ```typescript
-interface CactusVADVadParams {
+interface CactusAudioVADParams {
   audio: string | number[];
-  options?: CactusVADOptions;
+  options?: CactusAudioVADOptions;
+}
+```
+
+### CactusAudioDiarizeOptions
+
+```typescript
+interface CactusAudioDiarizeOptions {
+  stepMs?: number;
+  threshold?: number;
+  numSpeakers?: number;
+  minSpeakers?: number;
+  maxSpeakers?: number;
+}
+```
+
+### CactusAudioDiarizeParams
+
+```typescript
+interface CactusAudioDiarizeParams {
+  audio: string | number[];
+  options?: CactusAudioDiarizeOptions;
+}
+```
+
+### CactusAudioDiarizeResult
+
+```typescript
+interface CactusAudioDiarizeResult {
+  success: boolean;
+  error: string | null;
+  numSpeakers: number;
+  scores: number[];
+  totalTimeMs: number;
+  ramUsageMb: number;
+}
+```
+
+### CactusAudioEmbedSpeakerParams
+
+```typescript
+interface CactusAudioEmbedSpeakerParams {
+  audio: string | number[];
+}
+```
+
+### CactusAudioEmbedSpeakerResult
+
+```typescript
+interface CactusAudioEmbedSpeakerResult {
+  success: boolean;
+  error: string | null;
+  embedding: number[];
+  totalTimeMs: number;
+  ramUsageMb: number;
 }
 ```
 
diff --git a/android/src/main/jniLibs/arm64-v8a/libcactus.a b/android/src/main/jniLibs/arm64-v8a/libcactus.a
index a51d8b0..c3d6138 100644
Binary files a/android/src/main/jniLibs/arm64-v8a/libcactus.a and b/android/src/main/jniLibs/arm64-v8a/libcactus.a differ
diff --git a/cpp/HybridCactus.cpp b/cpp/HybridCactus.cpp
index 6a35bb2..156c99b 100644
--- a/cpp/HybridCactus.cpp
+++ b/cpp/HybridCactus.cpp
@@ -65,7 +65,8 @@ std::shared_ptr<Promise<std::string>> HybridCactus::complete(
                                  responseBuffer.data(), responseBufferSize,
                                  optionsJson ? optionsJson->c_str() : nullptr,
                                  toolsJson ? toolsJson->c_str() : nullptr,
-                                 cactusTokenCallback, &callbackCtx);
+                                 cactusTokenCallback, &callbackCtx,
+                                 nullptr, 0);
 
     if (result < 0) {
       throw std::runtime_error("Cactus complete failed: " +
@@ -79,6 +80,38 @@ std::shared_ptr<Promise<std::string>> HybridCactus::complete(
   });
 }
 
+std::shared_ptr<Promise<std::string>> HybridCactus::prefill(
+    const std::string &messagesJson, double responseBufferSize,
+    const std::optional<std::string> &optionsJson,
+    const std::optional<std::string> &toolsJson) { // Wraps cactus_prefill in Promise<std::string>::async; the lambda returns the response text.
+  return Promise<std::string>::async([this, messagesJson, responseBufferSize,
+                                      optionsJson,
+                                      toolsJson]() -> std::string { // arguments are captured by value
+    std::lock_guard<std::mutex> lock(this->_modelMutex); // hold _modelMutex for the whole native call
+
+    if (!this->_model) { // no model handle: fail before touching the native API
+      throw std::runtime_error("Cactus model is not initialized");
+    }
+
+    std::string responseBuffer;
+    responseBuffer.resize(responseBufferSize); // NUL-filled output buffer handed to cactus_prefill
+
+    int result = cactus_prefill(this->_model, messagesJson.c_str(),
+                                responseBuffer.data(), responseBufferSize,
+                                optionsJson ? optionsJson->c_str() : nullptr, // absent optional -> nullptr
+                                toolsJson ? toolsJson->c_str() : nullptr, // absent optional -> nullptr
+                                nullptr, 0); // no PCM buffer is supplied
+
+    if (result < 0) { // a negative return is treated as failure
+      throw std::runtime_error("Cactus prefill failed: " +
+                               std::string(cactus_get_last_error()));
+    }
+
+    responseBuffer.resize(strlen(responseBuffer.c_str())); // shrink to the text before the first NUL byte
+    return responseBuffer;
+  });
+}
+
 std::shared_ptr<Promise<std::vector<double>>>
 HybridCactus::tokenize(const std::string &text) {
   return Promise<std::vector<double>>::async([this,
@@ -488,6 +521,102 @@ HybridCactus::audioEmbed(const std::string &audioPath,
       });
 }
 
+std::shared_ptr<Promise<std::string>> HybridCactus::diarize(
+    const std::variant<std::vector<double>, std::string> &audio,
+    double responseBufferSize,
+    const std::optional<std::string> &optionsJson) { // Wraps cactus_diarize; audio is either a path string or a vector of numeric samples.
+  return Promise<std::string>::async(
+      [this, audio, responseBufferSize, optionsJson]() -> std::string { // arguments are captured by value
+        std::lock_guard<std::mutex> lock(this->_modelMutex); // hold _modelMutex for the whole native call
+
+        if (!this->_model) { // no model handle: fail before touching the native API
+          throw std::runtime_error("Cactus model is not initialized");
+        }
+
+        std::string responseBuffer;
+        responseBuffer.resize(responseBufferSize); // NUL-filled output buffer handed to cactus_diarize
+
+        int result;
+        if (std::holds_alternative<std::string>(audio)) { // string alternative: passed as the audio file path, no PCM buffer
+          result = cactus_diarize(
+              this->_model, std::get<std::string>(audio).c_str(),
+              responseBuffer.data(), responseBufferSize,
+              optionsJson ? optionsJson->c_str() : nullptr, nullptr, 0);
+        } else { // vector alternative: converted to bytes and passed as the PCM buffer
+          const auto &audioDoubles = std::get<std::vector<double>>(audio);
+
+          std::vector<uint8_t> audioBytes;
+          audioBytes.reserve(audioDoubles.size()); // one output byte per input value
+          for (double d : audioDoubles) {
+            d = std::clamp(d, 0.0, 255.0); // clamp into the uint8_t range before the cast
+            audioBytes.emplace_back(static_cast<uint8_t>(d)); // the cast drops any fractional part
+          }
+
+          result = cactus_diarize(
+              this->_model, nullptr, // no file path in this branch
+              responseBuffer.data(), responseBufferSize,
+              optionsJson ? optionsJson->c_str() : nullptr,
+              audioBytes.data(), audioBytes.size());
+        }
+
+        if (result < 0) { // a negative return is treated as failure
+          throw std::runtime_error("Cactus diarize failed: " +
+                                   std::string(cactus_get_last_error()));
+        }
+
+        responseBuffer.resize(strlen(responseBuffer.c_str())); // shrink to the text before the first NUL byte
+        return responseBuffer;
+      });
+}
+
+std::shared_ptr<Promise<std::string>> HybridCactus::embedSpeaker(
+    const std::variant<std::vector<double>, std::string> &audio,
+    double responseBufferSize,
+    const std::optional<std::string> &optionsJson) { // Wraps cactus_embed_speaker; audio is either a path string or a vector of numeric samples.
+  return Promise<std::string>::async(
+      [this, audio, responseBufferSize, optionsJson]() -> std::string { // arguments are captured by value
+        std::lock_guard<std::mutex> lock(this->_modelMutex); // hold _modelMutex for the whole native call
+
+        if (!this->_model) { // no model handle: fail before touching the native API
+          throw std::runtime_error("Cactus model is not initialized");
+        }
+
+        std::string responseBuffer;
+        responseBuffer.resize(responseBufferSize); // NUL-filled output buffer handed to cactus_embed_speaker
+
+        int result;
+        if (std::holds_alternative<std::string>(audio)) { // string alternative: passed as the audio file path, no PCM buffer
+          result = cactus_embed_speaker(
+              this->_model, std::get<std::string>(audio).c_str(),
+              responseBuffer.data(), responseBufferSize,
+              optionsJson ? optionsJson->c_str() : nullptr, nullptr, 0);
+        } else { // vector alternative: converted to bytes and passed as the PCM buffer
+          const auto &audioDoubles = std::get<std::vector<double>>(audio);
+
+          std::vector<uint8_t> audioBytes;
+          audioBytes.reserve(audioDoubles.size()); // one output byte per input value
+          for (double d : audioDoubles) {
+            d = std::clamp(d, 0.0, 255.0); // clamp into the uint8_t range before the cast
+            audioBytes.emplace_back(static_cast<uint8_t>(d)); // the cast drops any fractional part
+          }
+
+          result = cactus_embed_speaker(
+              this->_model, nullptr, // no file path in this branch
+              responseBuffer.data(), responseBufferSize,
+              optionsJson ? optionsJson->c_str() : nullptr,
+              audioBytes.data(), audioBytes.size());
+        }
+
+        if (result < 0) { // a negative return is treated as failure
+          throw std::runtime_error("Cactus embed speaker failed: " +
+                                   std::string(cactus_get_last_error()));
+        }
+
+        responseBuffer.resize(strlen(responseBuffer.c_str())); // shrink to the text before the first NUL byte
+        return responseBuffer;
+      });
+}
+
 std::shared_ptr<Promise<void>> HybridCactus::reset() {
   return Promise<void>::async([this]() -> void {
     std::lock_guard<std::mutex> lock(this->_modelMutex);
@@ -525,7 +654,7 @@ std::shared_ptr<Promise<void>> HybridCactus::destroy() {
 std::shared_ptr<Promise<void>>
 HybridCactus::setTelemetryEnvironment(const std::string &cacheDir) {
   return Promise<void>::async([cacheDir]() -> void {
-    cactus_set_telemetry_environment("react-native", cacheDir.c_str(), "1.10.0");
+    cactus_set_telemetry_environment("react-native", cacheDir.c_str(), "1.12.0");
   });
 }
 
diff --git a/cpp/HybridCactus.hpp b/cpp/HybridCactus.hpp
index 2c5db1d..16d57e9 100644
--- a/cpp/HybridCactus.hpp
+++ b/cpp/HybridCactus.hpp
@@ -24,6 +24,11 @@ class HybridCactus : public HybridCactusSpec {
                                              double /* tokenId */)>> &callback)
       override;
 
+  std::shared_ptr<Promise<std::string>> prefill(
+      const std::string &messagesJson, double responseBufferSize,
+      const std::optional<std::string> &optionsJson,
+      const std::optional<std::string> &toolsJson) override;
+
   std::shared_ptr<Promise<std::vector<double>>>
   tokenize(const std::string &text) override;
 
@@ -67,6 +72,16 @@ class HybridCactus : public HybridCactusSpec {
   std::shared_ptr<Promise<std::vector<double>>>
   audioEmbed(const std::string &audioPath, double embeddingBufferSize) override;
 
+  std::shared_ptr<Promise<std::string>>
+  diarize(const std::variant<std::vector<double>, std::string> &audio,
+          double responseBufferSize,
+          const std::optional<std::string> &optionsJson) override;
+
+  std::shared_ptr<Promise<std::string>>
+  embedSpeaker(const std::variant<std::vector<double>, std::string> &audio,
+               double responseBufferSize,
+               const std::optional<std::string> &optionsJson) override;
+
   std::shared_ptr<Promise<void>> reset() override;
 
   std::shared_ptr<Promise<void>> stop() override;
diff --git a/cpp/cactus_ffi.h b/cpp/cactus_ffi.h
index aa72986..6e35847 100644
--- a/cpp/cactus_ffi.h
+++ b/cpp/cactus_ffi.h
@@ -41,7 +41,20 @@ CACTUS_FFI_EXPORT int cactus_complete(
     const char* options_json,               // optional
     const char* tools_json,                 // optional
     cactus_token_callback callback,         // optional
-    void* user_data                         // optional
+    void* user_data,                        // optional
+    const uint8_t* pcm_buffer,             // optional: NULL when not used
+    size_t pcm_buffer_size                 // optional: 0 when not used
+);
+
+CACTUS_FFI_EXPORT int cactus_prefill(
+    cactus_model_t model,
+    const char* messages_json,
+    char* response_buffer,
+    size_t buffer_size,
+    const char* options_json,               // optional
+    const char* tools_json,                 // optional
+    const uint8_t* pcm_buffer,             // optional: NULL when not used
+    size_t pcm_buffer_size                 // optional: 0 when not used
 );
 
 CACTUS_FFI_EXPORT int cactus_tokenize(
@@ -140,6 +153,26 @@ CACTUS_FFI_EXPORT int cactus_vad(
     size_t pcm_buffer_size
 );
 
+CACTUS_FFI_EXPORT int cactus_diarize(
+    cactus_model_t model,
+    const char* audio_file_path,
+    char* response_buffer,
+    size_t buffer_size,
+    const char* options_json,
+    const uint8_t* pcm_buffer,
+    size_t pcm_buffer_size
+);
+
+CACTUS_FFI_EXPORT int cactus_embed_speaker(
+    cactus_model_t model,
+    const char* audio_file_path,
+    char* response_buffer,
+    size_t buffer_size,
+    const char* options_json,
+    const uint8_t* pcm_buffer,
+    size_t pcm_buffer_size
+);
+
 CACTUS_FFI_EXPORT int cactus_rag_query(
     cactus_model_t model,
     const char* query,
@@ -148,7 +181,6 @@ CACTUS_FFI_EXPORT int cactus_rag_query(
     size_t top_k
 );
 
-
 CACTUS_FFI_EXPORT cactus_index_t cactus_index_init(
     const char* index_dir,
     size_t embedding_dim
@@ -199,11 +231,217 @@ CACTUS_FFI_EXPORT void cactus_index_destroy(cactus_index_t index);
 
 CACTUS_FFI_EXPORT const char* cactus_get_last_error(void);
 
+// level: 0=DEBUG, 1=INFO, 2=WARN (default), 3=ERROR, 4=NONE
+CACTUS_FFI_EXPORT void cactus_log_set_level(int level);
+
+typedef void (*cactus_log_callback_t)(int level, const char* component, const char* message, void* user_data);
+CACTUS_FFI_EXPORT void cactus_log_set_callback(cactus_log_callback_t callback, void* user_data);
+
 CACTUS_FFI_EXPORT void cactus_set_telemetry_environment(const char* framework, const char* cache_location, const char* version);
 CACTUS_FFI_EXPORT void cactus_set_app_id(const char* app_id);
 CACTUS_FFI_EXPORT void cactus_telemetry_flush(void);
 CACTUS_FFI_EXPORT void cactus_telemetry_shutdown(void);
 
+// cactus graph export
+typedef void* cactus_graph_t;
+typedef uint64_t cactus_node_t;
+
+typedef struct {
+    int32_t precision;
+    size_t rank;
+    size_t shape[8]; 
+    size_t num_elements;
+    size_t byte_size;
+} cactus_tensor_info_t;
+
+CACTUS_FFI_EXPORT cactus_graph_t cactus_graph_create(void);
+CACTUS_FFI_EXPORT void cactus_graph_destroy(cactus_graph_t graph);
+CACTUS_FFI_EXPORT int cactus_graph_hard_reset(cactus_graph_t graph);
+
+CACTUS_FFI_EXPORT int cactus_graph_input(
+    cactus_graph_t graph, const size_t* shape, size_t rank, int32_t precision,
+cactus_node_t* out_node);
+
+CACTUS_FFI_EXPORT int cactus_graph_set_input(
+    cactus_graph_t graph, cactus_node_t node, const void* data, int32_t
+precision);
+CACTUS_FFI_EXPORT int cactus_graph_set_external_input(
+    cactus_graph_t graph, cactus_node_t node, void* data, int32_t precision);
+
+CACTUS_FFI_EXPORT int cactus_graph_precision_cast(
+    cactus_graph_t graph, cactus_node_t input, int32_t target_precision, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_quantize_activations(
+    cactus_graph_t graph, cactus_node_t input, cactus_node_t* out);
+
+CACTUS_FFI_EXPORT int cactus_graph_add(cactus_graph_t graph, cactus_node_t a,
+cactus_node_t b, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_add_clipped(cactus_graph_t graph, cactus_node_t a,
+cactus_node_t b, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_subtract(cactus_graph_t graph, cactus_node_t
+a, cactus_node_t b, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_multiply(cactus_graph_t graph, cactus_node_t
+a, cactus_node_t b, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_divide(cactus_graph_t graph, cactus_node_t
+a, cactus_node_t b, cactus_node_t* out);
+
+CACTUS_FFI_EXPORT int cactus_graph_scalar_add(cactus_graph_t graph, cactus_node_t x, float value, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_scalar_subtract(cactus_graph_t graph, cactus_node_t x, float value, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_scalar_multiply(cactus_graph_t graph, cactus_node_t x, float value, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_scalar_divide(cactus_graph_t graph, cactus_node_t x, float value, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_scalar_exp(cactus_graph_t graph, cactus_node_t x, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_scalar_sqrt(cactus_graph_t graph, cactus_node_t x, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_scalar_cos(cactus_graph_t graph, cactus_node_t x, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_scalar_sin(cactus_graph_t graph, cactus_node_t x, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_scalar_log(cactus_graph_t graph, cactus_node_t x, cactus_node_t* out);
+
+CACTUS_FFI_EXPORT int cactus_graph_abs(cactus_graph_t graph, cactus_node_t x,
+cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_pow(cactus_graph_t graph, cactus_node_t x,
+float exponent, cactus_node_t* out);
+
+CACTUS_FFI_EXPORT int cactus_graph_view(
+    cactus_graph_t graph, cactus_node_t x, const size_t* shape, size_t rank,
+cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_flatten(
+    cactus_graph_t graph, cactus_node_t x, int32_t start_dim, int32_t end_dim,
+cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_reshape(
+    cactus_graph_t graph, cactus_node_t x, const size_t* shape, size_t rank, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_transpose(
+    cactus_graph_t graph, cactus_node_t x, int32_t backend, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_transpose_n(
+    cactus_graph_t graph, cactus_node_t x, const size_t* permutation, size_t rank, int32_t backend, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_slice(
+    cactus_graph_t graph, cactus_node_t x, int32_t axis, size_t start, size_t length, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_index(
+    cactus_graph_t graph, cactus_node_t x, size_t index_value, int32_t dim, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_sum(cactus_graph_t graph, cactus_node_t x, int32_t axis, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_mean(cactus_graph_t graph, cactus_node_t x, int32_t axis, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_variance(cactus_graph_t graph, cactus_node_t x, int32_t axis, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_min(cactus_graph_t graph, cactus_node_t x, int32_t axis, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_max(cactus_graph_t graph, cactus_node_t x, int32_t axis, cactus_node_t* out);
+
+CACTUS_FFI_EXPORT int cactus_graph_concat(
+    cactus_graph_t graph, cactus_node_t a, cactus_node_t b, int32_t axis,
+cactus_node_t* out);
+
+CACTUS_FFI_EXPORT int cactus_graph_cat(
+    cactus_graph_t graph, const cactus_node_t* nodes, size_t count, int32_t
+axis, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_matmul(
+    cactus_graph_t graph, cactus_node_t a, cactus_node_t b, bool pretransposed_rhs, int32_t backend, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_gather(
+    cactus_graph_t graph, cactus_node_t tensor, cactus_node_t indices, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_embedding_from_tensor(
+    cactus_graph_t graph, cactus_node_t embedding_tensor, cactus_node_t indices, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_embedding_from_file(
+    cactus_graph_t graph, const char* filename, cactus_node_t indices, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_mmap_embeddings(
+    cactus_graph_t graph, const char* filename, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_mmap_weights(
+    cactus_graph_t graph, const char* filename, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_bilinear_interpolation(
+    cactus_graph_t graph, cactus_node_t pos_embeds, size_t dst_height, size_t dst_width, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_set_grouped_scales(
+    cactus_graph_t graph, cactus_node_t node, size_t group_size, size_t num_groups, void* scales_ptr);
+CACTUS_FFI_EXPORT int cactus_graph_set_interleaved(
+    cactus_graph_t graph, cactus_node_t node, bool interleaved, size_t original_n);
+CACTUS_FFI_EXPORT int cactus_graph_release_weight_pages(cactus_graph_t graph, cactus_node_t node);
+CACTUS_FFI_EXPORT int cactus_graph_prefetch_weight_pages(cactus_graph_t graph, cactus_node_t node);
+CACTUS_FFI_EXPORT int cactus_graph_release_all_weight_pages(cactus_graph_t graph);
+
+CACTUS_FFI_EXPORT int cactus_graph_relu(cactus_graph_t graph, cactus_node_t x, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_silu(cactus_graph_t graph, cactus_node_t x, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_gelu(cactus_graph_t graph, cactus_node_t x, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_gelu_erf(cactus_graph_t graph, cactus_node_t x, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_sigmoid(cactus_graph_t graph, cactus_node_t x, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_tanh(cactus_graph_t graph, cactus_node_t x, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_glu(cactus_graph_t graph, cactus_node_t x, int32_t axis, cactus_node_t* out);
+
+CACTUS_FFI_EXPORT int cactus_graph_layernorm(
+    cactus_graph_t graph, cactus_node_t input, cactus_node_t weight, cactus_node_t bias, float epsilon, bool has_bias, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_groupnorm(
+    cactus_graph_t graph, cactus_node_t input, cactus_node_t weight, cactus_node_t bias, size_t num_groups, float epsilon, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_batchnorm(
+    cactus_graph_t graph, cactus_node_t input, cactus_node_t weight, cactus_node_t bias, cactus_node_t running_mean, cactus_node_t running_var, int32_t axis, float epsilon, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_topk(cactus_graph_t graph, cactus_node_t input, size_t k, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_rms_norm(
+    cactus_graph_t graph, cactus_node_t input, cactus_node_t weight, float epsilon, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_rope(
+    cactus_graph_t graph, cactus_node_t input, float theta, size_t position_offset, int32_t backend, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_rope_gptj(
+    cactus_graph_t graph, cactus_node_t input, float theta, size_t position_offset, size_t rot_dim, int32_t backend, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_softmax(cactus_graph_t graph, cactus_node_t input, int32_t axis, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_attention(
+    cactus_graph_t graph, cactus_node_t query, cactus_node_t key, cactus_node_t value, float scale, bool is_causal, size_t position_offset, size_t window_size, int32_t backend, bool use_mask, cactus_node_t mask, bool additive_mask, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_rel_pos_bias(
+    cactus_graph_t graph, cactus_node_t query, cactus_node_t relative_key, float scale, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_attention_int8_hybrid(
+    cactus_graph_t graph, cactus_node_t query, cactus_node_t key_new, cactus_node_t value_new, float scale, size_t position_offset,
+    const int8_t* cached_keys, const int8_t* cached_values, const float* k_scales, const float* v_scales,
+    size_t cache_len, size_t num_kv_heads, size_t head_dim, size_t window_size, cactus_node_t* out);
+
+CACTUS_FFI_EXPORT int cactus_graph_conv1d_causal(
+    cactus_graph_t graph, cactus_node_t input, cactus_node_t weight, size_t kernel_size, size_t dilation, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_conv1d_k3(
+    cactus_graph_t graph, cactus_node_t input, cactus_node_t weight, size_t stride, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_conv1d_k7s3(
+    cactus_graph_t graph, cactus_node_t input, cactus_node_t weight, cactus_node_t bias, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_conv1d(
+    cactus_graph_t graph, cactus_node_t input, cactus_node_t weight, bool has_bias, cactus_node_t bias, size_t stride, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_conv1d_same_depthwise_k9(
+    cactus_graph_t graph, cactus_node_t input, cactus_node_t weight, bool has_bias, cactus_node_t bias, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_conv1d_pointwise(
+    cactus_graph_t graph, cactus_node_t input, cactus_node_t weight, bool has_bias, cactus_node_t bias, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_conv2d_k3s2p1(
+    cactus_graph_t graph, cactus_node_t input, cactus_node_t weight, bool has_bias, cactus_node_t bias, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_conv2d_depthwise_k3s2p1(
+    cactus_graph_t graph, cactus_node_t input, cactus_node_t weight, bool has_bias, cactus_node_t bias, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_conv2d_pointwise_1x1(
+    cactus_graph_t graph, cactus_node_t input, cactus_node_t weight, bool has_bias, cactus_node_t bias, cactus_node_t* out);
+
+CACTUS_FFI_EXPORT int cactus_graph_lstm_cell(
+    cactus_graph_t graph, cactus_node_t input, cactus_node_t h_prev, cactus_node_t c_prev, cactus_node_t weight_ih, cactus_node_t weight_hh, cactus_node_t bias_ih, cactus_node_t bias_hh, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_gated_deltanet_decode(
+    cactus_graph_t graph, cactus_node_t query, cactus_node_t key, cactus_node_t value, cactus_node_t gate_log, cactus_node_t beta, cactus_node_t initial_state, float scale, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_gated_deltanet_prefill(
+    cactus_graph_t graph, cactus_node_t query, cactus_node_t key, cactus_node_t value, cactus_node_t gate_log, cactus_node_t beta, cactus_node_t initial_state, size_t chunk_size, float scale, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_stft(
+    cactus_graph_t graph, cactus_node_t input, cactus_node_t weight, size_t stride, size_t num_fft_bins, cactus_node_t* out);
+
+CACTUS_FFI_EXPORT int cactus_graph_altup_predict(
+    cactus_graph_t graph, cactus_node_t coefs, const cactus_node_t* streams, size_t num_streams, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_altup_correct(
+    cactus_graph_t graph, cactus_node_t coefs, cactus_node_t innovation, const cactus_node_t* predictions, size_t num_predictions, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_gaussian_topk(
+    cactus_graph_t graph, cactus_node_t input, float ppf, cactus_node_t* out);
+
+CACTUS_FFI_EXPORT int cactus_graph_moe_layer_gated(
+    cactus_graph_t graph, cactus_node_t hidden, cactus_node_t routing_probs, cactus_node_t topk_indices,
+    const cactus_node_t* w1_weights, const cactus_node_t* w3_weights, const cactus_node_t* w2_weights,
+    size_t num_experts, size_t num_experts_per_tok, bool normalize_routing, float epsilon, float routed_scaling_factor, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_moe_layer_ungated(
+    cactus_graph_t graph, cactus_node_t hidden, cactus_node_t routing_probs, cactus_node_t topk_indices,
+    const cactus_node_t* w1_weights, const cactus_node_t* w2_weights,
+    size_t num_experts, size_t num_experts_per_tok, bool normalize_routing, float epsilon, float routed_scaling_factor, int32_t activation, cactus_node_t* out);
+
+CACTUS_FFI_EXPORT int cactus_graph_sample(
+    cactus_graph_t graph, cactus_node_t logits, float temperature, float top_p, size_t top_k, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_scatter_topk(
+    cactus_graph_t graph, cactus_node_t indices, cactus_node_t values, size_t num_classes, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_persistent(
+    cactus_graph_t graph, cactus_node_t source_node, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_is_populated(
+    cactus_graph_t graph, cactus_node_t persistent_node, int32_t* out_is_populated);
+CACTUS_FFI_EXPORT int cactus_graph_invalidate_persistent(
+    cactus_graph_t graph, cactus_node_t persistent_node);
+
+CACTUS_FFI_EXPORT int cactus_graph_execute(cactus_graph_t graph);
+CACTUS_FFI_EXPORT int cactus_graph_get_output_ptr(cactus_graph_t graph,
+cactus_node_t node, void** out_ptr);
+CACTUS_FFI_EXPORT int cactus_graph_get_output_info(cactus_graph_t graph,
+cactus_node_t node, cactus_tensor_info_t* out_info);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/example/ios/Podfile.lock b/example/ios/Podfile.lock
index 8c6e001..a6880d1 100644
--- a/example/ios/Podfile.lock
+++ b/example/ios/Podfile.lock
@@ -1,6 +1,6 @@
 PODS:
   - boost (1.84.0)
-  - Cactus (1.10.4):
+  - Cactus (1.12.0):
     - boost
     - DoubleConversion
     - fast_float
@@ -2643,7 +2643,7 @@ EXTERNAL SOURCES:
 
 SPEC CHECKSUMS:
   boost: 7e761d76ca2ce687f7cc98e698152abd03a18f90
-  Cactus: effc2b16da1131e7bcf3f101d7ad09abd7231a2c
+  Cactus: ce2107540aec408af1ad4e4a7502a9ab87f5a7eb
   DoubleConversion: cb417026b2400c8f53ae97020b2be961b59470cb
   fast_float: b32c788ed9c6a8c584d114d0047beda9664e7cc6
   FBLazyVector: b8f1312d48447cca7b4abc21ed155db14742bd03
diff --git a/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/cactus_ffi.h b/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/cactus_ffi.h
index aa72986..6e35847 100644
--- a/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/cactus_ffi.h
+++ b/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/cactus_ffi.h
@@ -41,7 +41,20 @@ CACTUS_FFI_EXPORT int cactus_complete(
     const char* options_json,               // optional
     const char* tools_json,                 // optional
     cactus_token_callback callback,         // optional
-    void* user_data                         // optional
+    void* user_data,                        // optional
+    const uint8_t* pcm_buffer,             // optional: NULL when not used
+    size_t pcm_buffer_size                 // optional: 0 when not used
+);
+
+CACTUS_FFI_EXPORT int cactus_prefill(
+    cactus_model_t model,
+    const char* messages_json,
+    char* response_buffer,
+    size_t buffer_size,
+    const char* options_json,               // optional
+    const char* tools_json,                 // optional
+    const uint8_t* pcm_buffer,             // optional: NULL when not used
+    size_t pcm_buffer_size                 // optional: 0 when not used
 );
 
 CACTUS_FFI_EXPORT int cactus_tokenize(
@@ -140,6 +153,26 @@ CACTUS_FFI_EXPORT int cactus_vad(
     size_t pcm_buffer_size
 );
 
+CACTUS_FFI_EXPORT int cactus_diarize(
+    cactus_model_t model,
+    const char* audio_file_path,
+    char* response_buffer,
+    size_t buffer_size,
+    const char* options_json,
+    const uint8_t* pcm_buffer,
+    size_t pcm_buffer_size
+);
+
+CACTUS_FFI_EXPORT int cactus_embed_speaker(
+    cactus_model_t model,
+    const char* audio_file_path,
+    char* response_buffer,
+    size_t buffer_size,
+    const char* options_json,
+    const uint8_t* pcm_buffer,
+    size_t pcm_buffer_size
+);
+
 CACTUS_FFI_EXPORT int cactus_rag_query(
     cactus_model_t model,
     const char* query,
@@ -148,7 +181,6 @@ CACTUS_FFI_EXPORT int cactus_rag_query(
     size_t top_k
 );
 
-
 CACTUS_FFI_EXPORT cactus_index_t cactus_index_init(
     const char* index_dir,
     size_t embedding_dim
@@ -199,11 +231,217 @@ CACTUS_FFI_EXPORT void cactus_index_destroy(cactus_index_t index);
 
 CACTUS_FFI_EXPORT const char* cactus_get_last_error(void);
 
+// level: 0=DEBUG, 1=INFO, 2=WARN (default), 3=ERROR, 4=NONE
+CACTUS_FFI_EXPORT void cactus_log_set_level(int level);
+
+typedef void (*cactus_log_callback_t)(int level, const char* component, const char* message, void* user_data);
+CACTUS_FFI_EXPORT void cactus_log_set_callback(cactus_log_callback_t callback, void* user_data);
+
 CACTUS_FFI_EXPORT void cactus_set_telemetry_environment(const char* framework, const char* cache_location, const char* version);
 CACTUS_FFI_EXPORT void cactus_set_app_id(const char* app_id);
 CACTUS_FFI_EXPORT void cactus_telemetry_flush(void);
 CACTUS_FFI_EXPORT void cactus_telemetry_shutdown(void);
 
+// cactus graph export
+typedef void* cactus_graph_t;
+typedef uint64_t cactus_node_t;
+
+typedef struct {  // Tensor metadata; passed as the out_info argument of cactus_graph_get_output_info().
+    int32_t precision;  // precision code (the set of valid values is not visible in this section)
+    size_t rank;  // presumably the number of meaningful entries in shape[] -- TODO confirm
+    size_t shape[8];  // fixed capacity: at most 8 dimensions fit
+    size_t num_elements;
+    size_t byte_size;
+} cactus_tensor_info_t;
+
+CACTUS_FFI_EXPORT cactus_graph_t cactus_graph_create(void);
+CACTUS_FFI_EXPORT void cactus_graph_destroy(cactus_graph_t graph);
+CACTUS_FFI_EXPORT int cactus_graph_hard_reset(cactus_graph_t graph);
+
+CACTUS_FFI_EXPORT int cactus_graph_input(
+    cactus_graph_t graph, const size_t* shape, size_t rank, int32_t precision,
+cactus_node_t* out_node);
+
+CACTUS_FFI_EXPORT int cactus_graph_set_input(
+    cactus_graph_t graph, cactus_node_t node, const void* data, int32_t
+precision);
+CACTUS_FFI_EXPORT int cactus_graph_set_external_input(
+    cactus_graph_t graph, cactus_node_t node, void* data, int32_t precision);
+
+CACTUS_FFI_EXPORT int cactus_graph_precision_cast(
+    cactus_graph_t graph, cactus_node_t input, int32_t target_precision, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_quantize_activations(
+    cactus_graph_t graph, cactus_node_t input, cactus_node_t* out);
+
+CACTUS_FFI_EXPORT int cactus_graph_add(cactus_graph_t graph, cactus_node_t a,
+cactus_node_t b, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_add_clipped(cactus_graph_t graph, cactus_node_t a,
+cactus_node_t b, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_subtract(cactus_graph_t graph, cactus_node_t
+a, cactus_node_t b, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_multiply(cactus_graph_t graph, cactus_node_t
+a, cactus_node_t b, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_divide(cactus_graph_t graph, cactus_node_t
+a, cactus_node_t b, cactus_node_t* out);
+
+CACTUS_FFI_EXPORT int cactus_graph_scalar_add(cactus_graph_t graph, cactus_node_t x, float value, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_scalar_subtract(cactus_graph_t graph, cactus_node_t x, float value, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_scalar_multiply(cactus_graph_t graph, cactus_node_t x, float value, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_scalar_divide(cactus_graph_t graph, cactus_node_t x, float value, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_scalar_exp(cactus_graph_t graph, cactus_node_t x, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_scalar_sqrt(cactus_graph_t graph, cactus_node_t x, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_scalar_cos(cactus_graph_t graph, cactus_node_t x, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_scalar_sin(cactus_graph_t graph, cactus_node_t x, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_scalar_log(cactus_graph_t graph, cactus_node_t x, cactus_node_t* out);
+
+CACTUS_FFI_EXPORT int cactus_graph_abs(cactus_graph_t graph, cactus_node_t x,
+cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_pow(cactus_graph_t graph, cactus_node_t x,
+float exponent, cactus_node_t* out);
+
+CACTUS_FFI_EXPORT int cactus_graph_view(
+    cactus_graph_t graph, cactus_node_t x, const size_t* shape, size_t rank,
+cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_flatten(
+    cactus_graph_t graph, cactus_node_t x, int32_t start_dim, int32_t end_dim,
+cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_reshape(
+    cactus_graph_t graph, cactus_node_t x, const size_t* shape, size_t rank, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_transpose(
+    cactus_graph_t graph, cactus_node_t x, int32_t backend, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_transpose_n(
+    cactus_graph_t graph, cactus_node_t x, const size_t* permutation, size_t rank, int32_t backend, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_slice(
+    cactus_graph_t graph, cactus_node_t x, int32_t axis, size_t start, size_t length, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_index(
+    cactus_graph_t graph, cactus_node_t x, size_t index_value, int32_t dim, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_sum(cactus_graph_t graph, cactus_node_t x, int32_t axis, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_mean(cactus_graph_t graph, cactus_node_t x, int32_t axis, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_variance(cactus_graph_t graph, cactus_node_t x, int32_t axis, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_min(cactus_graph_t graph, cactus_node_t x, int32_t axis, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_max(cactus_graph_t graph, cactus_node_t x, int32_t axis, cactus_node_t* out);
+
+CACTUS_FFI_EXPORT int cactus_graph_concat(
+    cactus_graph_t graph, cactus_node_t a, cactus_node_t b, int32_t axis,
+cactus_node_t* out);
+
+CACTUS_FFI_EXPORT int cactus_graph_cat(
+    cactus_graph_t graph, const cactus_node_t* nodes, size_t count, int32_t
+axis, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_matmul(
+    cactus_graph_t graph, cactus_node_t a, cactus_node_t b, bool pretransposed_rhs, int32_t backend, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_gather(
+    cactus_graph_t graph, cactus_node_t tensor, cactus_node_t indices, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_embedding_from_tensor(
+    cactus_graph_t graph, cactus_node_t embedding_tensor, cactus_node_t indices, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_embedding_from_file(
+    cactus_graph_t graph, const char* filename, cactus_node_t indices, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_mmap_embeddings(
+    cactus_graph_t graph, const char* filename, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_mmap_weights(
+    cactus_graph_t graph, const char* filename, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_bilinear_interpolation(
+    cactus_graph_t graph, cactus_node_t pos_embeds, size_t dst_height, size_t dst_width, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_set_grouped_scales(
+    cactus_graph_t graph, cactus_node_t node, size_t group_size, size_t num_groups, void* scales_ptr);
+CACTUS_FFI_EXPORT int cactus_graph_set_interleaved(
+    cactus_graph_t graph, cactus_node_t node, bool interleaved, size_t original_n);
+CACTUS_FFI_EXPORT int cactus_graph_release_weight_pages(cactus_graph_t graph, cactus_node_t node);
+CACTUS_FFI_EXPORT int cactus_graph_prefetch_weight_pages(cactus_graph_t graph, cactus_node_t node);
+CACTUS_FFI_EXPORT int cactus_graph_release_all_weight_pages(cactus_graph_t graph);
+
+CACTUS_FFI_EXPORT int cactus_graph_relu(cactus_graph_t graph, cactus_node_t x, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_silu(cactus_graph_t graph, cactus_node_t x, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_gelu(cactus_graph_t graph, cactus_node_t x, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_gelu_erf(cactus_graph_t graph, cactus_node_t x, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_sigmoid(cactus_graph_t graph, cactus_node_t x, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_tanh(cactus_graph_t graph, cactus_node_t x, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_glu(cactus_graph_t graph, cactus_node_t x, int32_t axis, cactus_node_t* out);
+
+CACTUS_FFI_EXPORT int cactus_graph_layernorm(
+    cactus_graph_t graph, cactus_node_t input, cactus_node_t weight, cactus_node_t bias, float epsilon, bool has_bias, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_groupnorm(
+    cactus_graph_t graph, cactus_node_t input, cactus_node_t weight, cactus_node_t bias, size_t num_groups, float epsilon, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_batchnorm(
+    cactus_graph_t graph, cactus_node_t input, cactus_node_t weight, cactus_node_t bias, cactus_node_t running_mean, cactus_node_t running_var, int32_t axis, float epsilon, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_topk(cactus_graph_t graph, cactus_node_t input, size_t k, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_rms_norm(
+    cactus_graph_t graph, cactus_node_t input, cactus_node_t weight, float epsilon, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_rope(
+    cactus_graph_t graph, cactus_node_t input, float theta, size_t position_offset, int32_t backend, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_rope_gptj(
+    cactus_graph_t graph, cactus_node_t input, float theta, size_t position_offset, size_t rot_dim, int32_t backend, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_softmax(cactus_graph_t graph, cactus_node_t input, int32_t axis, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_attention(
+    cactus_graph_t graph, cactus_node_t query, cactus_node_t key, cactus_node_t value, float scale, bool is_causal, size_t position_offset, size_t window_size, int32_t backend, bool use_mask, cactus_node_t mask, bool additive_mask, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_rel_pos_bias(
+    cactus_graph_t graph, cactus_node_t query, cactus_node_t relative_key, float scale, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_attention_int8_hybrid(
+    cactus_graph_t graph, cactus_node_t query, cactus_node_t key_new, cactus_node_t value_new, float scale, size_t position_offset,
+    const int8_t* cached_keys, const int8_t* cached_values, const float* k_scales, const float* v_scales,
+    size_t cache_len, size_t num_kv_heads, size_t head_dim, size_t window_size, cactus_node_t* out);
+
+CACTUS_FFI_EXPORT int cactus_graph_conv1d_causal(
+    cactus_graph_t graph, cactus_node_t input, cactus_node_t weight, size_t kernel_size, size_t dilation, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_conv1d_k3(
+    cactus_graph_t graph, cactus_node_t input, cactus_node_t weight, size_t stride, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_conv1d_k7s3(
+    cactus_graph_t graph, cactus_node_t input, cactus_node_t weight, cactus_node_t bias, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_conv1d(
+    cactus_graph_t graph, cactus_node_t input, cactus_node_t weight, bool has_bias, cactus_node_t bias, size_t stride, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_conv1d_same_depthwise_k9(
+    cactus_graph_t graph, cactus_node_t input, cactus_node_t weight, bool has_bias, cactus_node_t bias, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_conv1d_pointwise(
+    cactus_graph_t graph, cactus_node_t input, cactus_node_t weight, bool has_bias, cactus_node_t bias, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_conv2d_k3s2p1(
+    cactus_graph_t graph, cactus_node_t input, cactus_node_t weight, bool has_bias, cactus_node_t bias, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_conv2d_depthwise_k3s2p1(
+    cactus_graph_t graph, cactus_node_t input, cactus_node_t weight, bool has_bias, cactus_node_t bias, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_conv2d_pointwise_1x1(
+    cactus_graph_t graph, cactus_node_t input, cactus_node_t weight, bool has_bias, cactus_node_t bias, cactus_node_t* out);
+
+CACTUS_FFI_EXPORT int cactus_graph_lstm_cell(
+    cactus_graph_t graph, cactus_node_t input, cactus_node_t h_prev, cactus_node_t c_prev, cactus_node_t weight_ih, cactus_node_t weight_hh, cactus_node_t bias_ih, cactus_node_t bias_hh, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_gated_deltanet_decode(
+    cactus_graph_t graph, cactus_node_t query, cactus_node_t key, cactus_node_t value, cactus_node_t gate_log, cactus_node_t beta, cactus_node_t initial_state, float scale, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_gated_deltanet_prefill(
+    cactus_graph_t graph, cactus_node_t query, cactus_node_t key, cactus_node_t value, cactus_node_t gate_log, cactus_node_t beta, cactus_node_t initial_state, size_t chunk_size, float scale, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_stft(
+    cactus_graph_t graph, cactus_node_t input, cactus_node_t weight, size_t stride, size_t num_fft_bins, cactus_node_t* out);
+
+CACTUS_FFI_EXPORT int cactus_graph_altup_predict(
+    cactus_graph_t graph, cactus_node_t coefs, const cactus_node_t* streams, size_t num_streams, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_altup_correct(
+    cactus_graph_t graph, cactus_node_t coefs, cactus_node_t innovation, const cactus_node_t* predictions, size_t num_predictions, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_gaussian_topk(
+    cactus_graph_t graph, cactus_node_t input, float ppf, cactus_node_t* out);
+
+CACTUS_FFI_EXPORT int cactus_graph_moe_layer_gated(
+    cactus_graph_t graph, cactus_node_t hidden, cactus_node_t routing_probs, cactus_node_t topk_indices,
+    const cactus_node_t* w1_weights, const cactus_node_t* w3_weights, const cactus_node_t* w2_weights,
+    size_t num_experts, size_t num_experts_per_tok, bool normalize_routing, float epsilon, float routed_scaling_factor, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_moe_layer_ungated(
+    cactus_graph_t graph, cactus_node_t hidden, cactus_node_t routing_probs, cactus_node_t topk_indices,
+    const cactus_node_t* w1_weights, const cactus_node_t* w2_weights,
+    size_t num_experts, size_t num_experts_per_tok, bool normalize_routing, float epsilon, float routed_scaling_factor, int32_t activation, cactus_node_t* out);
+
+CACTUS_FFI_EXPORT int cactus_graph_sample(
+    cactus_graph_t graph, cactus_node_t logits, float temperature, float top_p, size_t top_k, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_scatter_topk(
+    cactus_graph_t graph, cactus_node_t indices, cactus_node_t values, size_t num_classes, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_persistent(
+    cactus_graph_t graph, cactus_node_t source_node, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_is_populated(
+    cactus_graph_t graph, cactus_node_t persistent_node, int32_t* out_is_populated);
+CACTUS_FFI_EXPORT int cactus_graph_invalidate_persistent(
+    cactus_graph_t graph, cactus_node_t persistent_node);
+
+CACTUS_FFI_EXPORT int cactus_graph_execute(cactus_graph_t graph);
+CACTUS_FFI_EXPORT int cactus_graph_get_output_ptr(cactus_graph_t graph,
+cactus_node_t node, void** out_ptr);
+CACTUS_FFI_EXPORT int cactus_graph_get_output_info(cactus_graph_t graph,
+cactus_node_t node, cactus_tensor_info_t* out_info);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/cactus_utils.h b/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/cactus_utils.h
index 3b5d97f..6570f09 100644
--- a/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/cactus_utils.h
+++ b/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/cactus_utils.h
@@ -6,6 +6,7 @@
 #include <string>
 #include <vector>
 #include <unordered_map>
+#include <map>
 #include <stdexcept>
 #include <sstream>
 #include <iomanip>
@@ -63,6 +64,16 @@ struct CactusModelHandle {
     std::unique_ptr<cactus::engine::Model> vad_model;
     std::atomic<bool> should_stop;
     std::vector<uint32_t> processed_tokens;
+    struct ProcessedImage {  // an image identified by its path plus a modification timestamp
+        std::string path;
+        long long last_modified_timestamp{0};
+        // Two entries are equal only when both the timestamp and the path match.
+        bool operator==(const ProcessedImage& rhs) const {
+            return last_modified_timestamp == rhs.last_modified_timestamp && path == rhs.path;
+        }
+    };
+
+    std::vector<std::vector<ProcessedImage>> processed_images;
     std::mutex model_mutex;
     std::string model_name;
     std::unique_ptr<cactus::engine::index::Index> corpus_index;
@@ -124,6 +135,66 @@ inline cactus::engine::AudioProcessor::SpectrogramConfig get_parakeet_spectrogra
     return cfg;
 }
 
+inline cactus::engine::AudioProcessor::SpectrogramConfig get_htk_spectrogram_config() {
+    cactus::engine::AudioProcessor::SpectrogramConfig preset{};  // base preset reused by the Gemma 4 audio config
+    preset.frame_length = 320;  // framing / FFT sizes
+    preset.hop_length   = 160;
+    preset.n_fft        = 321;
+    preset.fft_override = 1024;
+    preset.center        = false;  // windowing, padding and conditioning flags
+    preset.onesided      = true;
+    preset.hann_periodic = true;
+    preset.pad_mode      = "constant";
+    preset.remove_dc_offset = false;
+    preset.dither    = 0.0f;
+    preset.power     = 1.0f;  // magnitude / log-mel scaling
+    preset.reference = 1.0f;
+    preset.log_mel   = "log";
+    preset.mel_floor = 0.001f;
+    preset.min_value = 0.001f;
+    return preset;
+}
+
+inline cactus::engine::AudioProcessor::SpectrogramConfig get_gemma4_audio_spectrogram_config(
+    const cactus::engine::Config& model_config) {
+    auto gemma_cfg = get_htk_spectrogram_config();  // start from the HTK preset, then override two fields
+    gemma_cfg.mel_floor_additive = true;
+    gemma_cfg.fft_override = model_config.audio_fft_length;  // FFT length is taken from the model config
+    return gemma_cfg;
+}
+
+inline cactus::engine::AudioProcessor::SpectrogramConfig get_wespeaker_spectrogram_config() {
+    cactus::engine::AudioProcessor::SpectrogramConfig preset{};  // every field set below is a fixed constant
+    preset.frame_length = 400;  // framing / FFT sizes
+    preset.hop_length   = 160;
+    preset.n_fft        = 512;
+    preset.center        = false;  // windowing, padding and conditioning
+    preset.onesided      = true;
+    preset.hann_periodic = false;
+    preset.window_a0     = 0.54f;
+    preset.pad_mode      = "constant";
+    preset.remove_dc_offset = true;
+    preset.preemphasis   = 0.97f;
+    preset.dither        = 0.0f;
+    preset.power     = 2.0f;  // magnitude / log-mel scaling
+    preset.reference = 1.0f;
+    preset.log_mel   = "log";
+    preset.mel_floor = 1.1754944e-38f;  // both floors use the same tiny value (roughly FLT_MIN)
+    preset.min_value = 1.1754944e-38f;
+    return preset;
+}
+
+inline std::vector<float> transpose_mel_to_frame_major(const std::vector<float>& mel,
+                                                        size_t num_mels, size_t num_frames) {
+    // Reorders a mel-major [num_mels x num_frames] buffer into frame-major [num_frames x num_mels].
+    std::vector<float> frame_major(num_frames * num_mels);
+    for (size_t frame = 0; frame < num_frames; ++frame) {
+        float* dst_row = frame_major.data() + frame * num_mels;
+        for (size_t bin = 0; bin < num_mels; ++bin) dst_row[bin] = mel[bin * num_frames + frame];
+    }
+    return frame_major;
+}
+
 inline void apply_preemphasis(std::vector<float>& waveform, float coefficient = 0.97f) {
     if (waveform.size() < 2 || coefficient == 0.0f) {
         return;
@@ -180,6 +251,56 @@ inline void trim_mel_frames(std::vector<float>& mel, size_t num_mels, size_t val
     mel.swap(trimmed);
 }
 
+struct AudioPreprocessResult {  // value returned by preprocess_audio_for_gemma4()
+    std::vector<float> features;  // frame-major mel values
+    size_t num_frames{0};
+    size_t num_soft_tokens{0};  // num_frames halved twice, rounding up each time
+};
+
+inline AudioPreprocessResult preprocess_audio_for_gemma4(
+    std::vector<float> audio_samples,
+    const cactus::engine::Config& model_config
+) {
+    // Converts raw samples into frame-major mel features; empty input yields an empty result.
+    AudioPreprocessResult out;
+    if (audio_samples.empty()) return out;
+    // Zero-pad the tail up to the next multiple of 320 samples.
+    const size_t tail = audio_samples.size() % 320;
+    if (tail != 0)
+        audio_samples.resize(audio_samples.size() + (320 - tail), 0.0f);
+
+    auto spec_cfg = get_gemma4_audio_spectrogram_config(model_config);
+    size_t mel_bins = model_config.audio_input_feat_size;
+
+    // Prepend half a frame of zeros ahead of the (already tail-padded) signal.
+    size_t lead_pad = spec_cfg.frame_length / 2;
+    audio_samples.insert(audio_samples.begin(), lead_pad, 0.0f);
+
+    // Mel filters use the "htk" scale; 0.0f / 8000.0f / 16000 are presumably min/max Hz and sample rate.
+    size_t effective_fft = spec_cfg.fft_override > 0 ? spec_cfg.fft_override : spec_cfg.n_fft;
+    cactus::engine::AudioProcessor processor;
+    processor.init_mel_filters(effective_fft / 2 + 1, mel_bins, 0.0f, 8000.0f, 16000, nullptr, "htk");
+    std::vector<float> mel = processor.compute_spectrogram(audio_samples, spec_cfg);
+    // NOTE(review): mel_bins == 0 would divide by zero on the next line -- confirm configs never report 0.
+    out.num_frames = mel.size() / mel_bins;
+    out.features = transpose_mel_to_frame_major(mel, mel_bins, out.num_frames);
+    // Soft-token count: two successive round-up halvings of the frame count.
+    out.num_soft_tokens = ((out.num_frames + 1) / 2 + 1) / 2;
+    return out;
+}
+
+inline std::vector<float> pcm_buffer_to_float_samples(
+    const uint8_t* pcm_buffer, size_t pcm_buffer_size
+) {
+    // Reads the bytes as native-endian int16 samples and scales each one by 1/32768.
+    constexpr float kScale = 1.0f / 32768.0f;
+    const size_t sample_count = pcm_buffer_size / 2;  // a trailing odd byte is ignored
+    const int16_t* src = reinterpret_cast<const int16_t*>(pcm_buffer);  // NOTE(review): assumes int16 alignment
+    std::vector<float> samples(sample_count);
+    for (size_t idx = 0; idx != sample_count; ++idx) samples[idx] = static_cast<float>(src[idx]) * kScale;
+    return samples;
+}
+
 } // namespace audio
 } // namespace cactus
 
@@ -226,6 +347,24 @@ struct ToolFunction {
     std::unordered_map<std::string, std::string> parameters;
 };
 
+struct InferenceOptions {  // bundle of inference options; each member carries its default value
+    float temperature{0.0f};
+    float top_p{0.0f};
+    float confidence_threshold{0.7f};
+    size_t top_k{0};
+    size_t max_tokens{100};
+    size_t tool_rag_top_k{2};
+    size_t cloud_timeout_ms{15000};  // 15 s, going by the _ms suffix
+    std::vector<std::string> stop_sequences;  // empty by default
+    bool force_tools{false};
+    bool include_stop_sequences{false};
+    bool use_vad{true};
+    bool telemetry_enabled{true};
+    bool auto_handoff{true};
+    bool handoff_with_images{true};
+    bool enable_thinking_if_supported{true};
+};
+
 } // namespace ffi
 } // namespace cactus
 
@@ -262,6 +401,24 @@ inline std::string trim_string(const std::string& s) {
     return s.substr(start, end - start);
 }
 
+inline size_t find_matching_delimiter(const std::string& s, size_t pos, char open, char close) {
+    // `pos` is taken to be the index of an opening delimiter (this is not checked). The result is
+    // the index just past its matching `close`; delimiters inside "..." literals are not counted.
+    const size_t len = s.length();
+    int nesting = 1;
+    for (++pos; pos < len && nesting > 0; ++pos) {
+        const char ch = s[pos];
+        if (ch == open) { ++nesting; continue; }
+        if (ch == close) { --nesting; continue; }
+        if (ch != '"') continue;
+        // Inside a string literal: run to its closing quote, stepping over backslash escapes.
+        for (++pos; pos < len && s[pos] != '"'; ++pos) {
+            if (s[pos] == '\\') ++pos;
+        }
+    }
+    return pos;  // can be >= s.length() when the input is unterminated
+}
+
 inline std::string env_or_default(const char* key, const char* fallback) {
     const char* v = std::getenv(key);
     if (v && v[0] != '\0') return std::string(v);
@@ -377,6 +534,119 @@ inline std::string serialize_tools_json(const std::vector<ToolFunction>& tools)
     return oss.str();
 }
 
+namespace json_sorted {  // Re-serialises JSON text with object keys sorted; lenient, does not validate.
+
+inline void skip_ws(const std::string& s, size_t& p) {  // Advances p past any whitespace.
+    while (p < s.size() && std::isspace(static_cast<unsigned char>(s[p]))) p++;
+}
+
+inline std::string parse_string(const std::string& s, size_t& p) {  // s[p] is taken to be the opening quote.
+    std::string r = "\"";
+    p++;
+    while (p < s.size()) {
+        if (s[p] == '\\') {  // copy the escape pair verbatim; escapes are never decoded
+            r += s[p++];
+            if (p < s.size()) r += s[p++];
+        } else if (s[p] == '"') {
+            r += '"';
+            p++;
+            return r;
+        } else {
+            r += s[p++];
+        }
+    }
+    return r;  // unterminated input: returned without a closing quote
+}
+
+inline std::string parse_value(const std::string& s, size_t& p);
+
+inline std::string parse_object(const std::string& s, size_t& p) {  // s[p] is '{'; returns {"k": v, ...}.
+    p++;
+    std::map<std::string, std::string> entries;  // ordered by quoted key text; a repeated key keeps its last value
+    skip_ws(s, p);
+    while (p < s.size() && s[p] != '}') {
+        if (s[p] == ',') { p++; skip_ws(s, p); continue; }
+        std::string key = parse_string(s, p);  // always consumes at least one character
+        skip_ws(s, p);
+        if (p < s.size() && s[p] == ':') p++;
+        skip_ws(s, p);
+        std::string val = parse_value(s, p);
+        entries[key] = val;
+        skip_ws(s, p);
+    }
+    if (p < s.size()) p++;  // consume the closing '}'
+    std::string r = "{";
+    bool first = true;
+    for (const auto& kv : entries) {
+        if (!first) r += ", ";
+        r += kv.first + ": " + kv.second;
+        first = false;
+    }
+    r += "}";
+    return r;
+}
+
+inline std::string parse_array(const std::string& s, size_t& p) {  // s[p] is '['; element order is kept.
+    p++;
+    std::vector<std::string> items;
+    skip_ws(s, p);
+    while (p < s.size() && s[p] != ']') {
+        // A stray '}' makes parse_value return without advancing; skip it so the loop always progresses.
+        if (s[p] == ',' || s[p] == '}') { p++; skip_ws(s, p); continue; }
+        items.push_back(parse_value(s, p));
+        skip_ws(s, p);
+    }
+    if (p < s.size()) p++;  // consume the closing ']'
+    std::string r = "[";
+    for (size_t i = 0; i < items.size(); i++) {
+        if (i > 0) r += ", ";
+        r += items[i];
+    }
+    r += "]";
+    return r;
+}
+
+inline std::string parse_value(const std::string& s, size_t& p) {  // Dispatches on the first non-space char.
+    skip_ws(s, p);
+    if (p >= s.size()) return "";
+    if (s[p] == '"') return parse_string(s, p);
+    if (s[p] == '{') return parse_object(s, p);
+    if (s[p] == '[') return parse_array(s, p);
+    size_t start = p;  // bare token (number, true, false, null, ...): copied as-is up to a delimiter
+    while (p < s.size() && s[p] != ',' && s[p] != '}' && s[p] != ']' && !std::isspace(static_cast<unsigned char>(s[p]))) p++;
+    return s.substr(start, p - start);
+}
+
+inline std::string reformat(const std::string& json) {  // Entry point: re-serialises the first value in json.
+    size_t p = 0;
+    return parse_value(json, p);
+}
+
+} // namespace json_sorted
+
+inline std::string serialize_tools_for_template(const std::vector<ToolFunction>& tools) {
+    // Renders every tool as one entry of the form
+    //   {"function": {"description": ..., "name": ..., "parameters": ...}, "type": "function"}
+    // with a '\n' in front of each entry. An empty tool list gives "".
+    // Keys of the function object are written in sorted order.
+
+    std::ostringstream out;
+    for (const auto& tool : tools) {
+        out << "\n{\"function\": " << "{";
+        out << "\"description\"" << ": " << "\"" << escape_json_string(tool.description) << "\"";
+        out << ", " << "\"name\"" << ": " << "\"" << escape_json_string(tool.name) << "\"";
+
+        // "parameters" appears only when the tool has a "schema" entry; that JSON text is passed
+        // through json_sorted::reformat before being embedded.
+        const auto schema = tool.parameters.find("schema");
+        if (schema != tool.parameters.end()) {
+            out << ", " << "\"parameters\"" << ": " << json_sorted::reformat(schema->second);
+        }
+
+        out << "}" << ", \"type\": \"function\"}";
+    }
+    return out.str();
+}
+
 inline void handle_error_response(const std::string& error_message, char* response_buffer, size_t buffer_size) {
     std::ostringstream json;
     json << "{";
@@ -401,10 +671,12 @@ inline void handle_error_response(const std::string& error_message, char* respon
     }
 }
 
-inline std::vector<cactus::engine::ChatMessage> parse_messages_json(const std::string& json, 
-                                                                   std::vector<std::string>& out_image_paths) {
+inline std::vector<cactus::engine::ChatMessage> parse_messages_json(const std::string& json,
+                                                                   std::vector<std::string>& out_image_paths,
+                                                                   std::vector<std::string>* out_audio_paths = nullptr) {
     std::vector<cactus::engine::ChatMessage> messages;
     out_image_paths.clear();
+    if (out_audio_paths) out_audio_paths->clear();
     
     size_t pos = json.find('[');
     if (pos == std::string::npos) {
@@ -457,39 +729,111 @@ inline std::vector<cactus::engine::ChatMessage> parse_messages_json(const std::s
             }
         }
         
-        size_t images_pos = json.find("\"images\"", pos);
-        if (images_pos != std::string::npos && images_pos < obj_end) {
-            size_t array_start = json.find('[', images_pos);
-            if (array_start != std::string::npos && array_start < obj_end) {
-                size_t array_end = json.find(']', array_start);
-                if (array_end != std::string::npos && array_end < obj_end) {
-                    size_t img_pos = array_start;
-                    while (true) {
-                        img_pos = json.find('"', img_pos + 1);
-                        if (img_pos == std::string::npos || img_pos >= array_end) break;
-                        
-                        size_t img_start = img_pos + 1;
-                        size_t img_end = json.find('"', img_start);
-                        if (img_end == std::string::npos || img_end > array_end) break;
-                        
-                        std::string img_path = json.substr(img_start, img_end - img_start);
-                        
-                        std::filesystem::path p(img_path);
-                        img_path = std::filesystem::absolute(p).string();
-                        
-                        msg.images.push_back(img_path);
-                        out_image_paths.push_back(img_path);
-                        img_pos = img_end;
+        auto parse_path_array = [&](const char* key, std::vector<std::string>& dest,
+                                    std::vector<std::string>* out_paths) {
+            size_t key_pos = json.find(key, pos);
+            if (key_pos == std::string::npos || key_pos >= obj_end) return;
+            size_t array_start = json.find('[', key_pos);
+            if (array_start == std::string::npos || array_start >= obj_end) return;
+            size_t array_end = json.find(']', array_start);
+            if (array_end == std::string::npos || array_end >= obj_end) return;
+            size_t cur = array_start;
+            while (true) {
+                cur = json.find('"', cur + 1);
+                if (cur == std::string::npos || cur >= array_end) break;
+                size_t str_start = cur + 1;
+                size_t str_end = json.find('"', str_start);
+                if (str_end == std::string::npos || str_end > array_end) break;
+                std::string path = std::filesystem::absolute(
+                    std::filesystem::path(json.substr(str_start, str_end - str_start))).string();
+                dest.push_back(path);
+                if (out_paths) out_paths->push_back(path);
+                cur = str_end;
+            }
+        };
+
+        parse_path_array("\"images\"", msg.images, &out_image_paths);
+        parse_path_array("\"audio\"", msg.audio, out_audio_paths);
+
+        if (msg.role == "tool") {
+            size_t name_pos = json.find("\"name\"", obj_start);
+            if (name_pos != std::string::npos && name_pos < obj_end) {
+                size_t name_quote = json.find('"', name_pos + 6);
+                if (name_quote != std::string::npos && name_quote < obj_end) {
+                    size_t name_start = name_quote + 1;
+                    size_t name_end = json.find('"', name_start);
+                    if (name_end != std::string::npos && name_end < obj_end) {
+                        msg.name = json.substr(name_start, name_end - name_start);
                     }
                 }
             }
         }
-        
+
+        size_t tool_calls_pos = json.find("\"tool_calls\"", obj_start);
+        if (tool_calls_pos != std::string::npos && tool_calls_pos < obj_end) {
+            size_t tool_calls_arr_start = json.find('[', tool_calls_pos);
+            if (tool_calls_arr_start != std::string::npos && tool_calls_arr_start < obj_end) {
+                size_t tool_calls_arr_end = find_matching_delimiter(json, tool_calls_arr_start, '[', ']');
+
+                size_t search_pos = tool_calls_arr_start;
+                while (true) {
+                    size_t func_pos = json.find("\"function\"", search_pos);
+                    if (func_pos == std::string::npos || func_pos >= tool_calls_arr_end) break;
+
+                    size_t func_obj_start = json.find('{', func_pos + 10);
+                    if (func_obj_start == std::string::npos || func_obj_start >= tool_calls_arr_end) break;
+
+                    size_t func_obj_end = find_matching_delimiter(json, func_obj_start, '{', '}');
+
+                    cactus::engine::ToolCallInfo tool_call;
+
+                    size_t fn_name_pos = json.find("\"name\"", func_obj_start);
+                    if (fn_name_pos != std::string::npos && fn_name_pos < func_obj_end) {
+                        size_t fn_name_quote = json.find('"', fn_name_pos + 6);
+                        if (fn_name_quote != std::string::npos && fn_name_quote < func_obj_end) {
+                            size_t fn_name_start = fn_name_quote + 1;
+                            size_t fn_name_end = json.find('"', fn_name_start);
+                            if (fn_name_end != std::string::npos && fn_name_end < func_obj_end) {
+                                tool_call.name = json.substr(fn_name_start, fn_name_end - fn_name_start);
+                            }
+                        }
+                    }
+
+                    size_t args_pos = json.find("\"arguments\"", func_obj_start);
+                    if (args_pos != std::string::npos && args_pos < func_obj_end) {
+                        size_t colon_pos = json.find(':', args_pos + 11);
+                        if (colon_pos != std::string::npos && colon_pos < func_obj_end) {
+                            size_t args_start = colon_pos + 1;
+                            while (args_start < json.length() && std::isspace(static_cast<unsigned char>(json[args_start]))) args_start++;
+
+                            if (args_start < func_obj_end && json[args_start] == '{') {
+                                size_t args_end = find_matching_delimiter(json, args_start, '{', '}');
+                                tool_call.arguments = json.substr(args_start, args_end - args_start);
+                            } else if (args_start < func_obj_end && json[args_start] == '"') {
+                                size_t str_start = args_start + 1;
+                                size_t str_end = str_start;
+                                while (str_end < json.length() && json[str_end] != '"') {
+                                    if (json[str_end] == '\\') str_end++;
+                                    str_end++;
+                                }
+                                tool_call.arguments = json.substr(str_start, str_end - str_start);
+                            }
+                        }
+                    }
+
+                    if (!tool_call.name.empty()) {
+                        msg.tool_calls.push_back(tool_call);
+                    }
+                    search_pos = func_obj_end;
+                }
+            }
+        }
+
         messages.push_back(msg);
-        
+
         pos = json.find('{', obj_end);
     }
-    
+
     return messages;
 }
 
@@ -538,128 +882,433 @@ inline std::vector<ToolFunction> parse_tools_json(const std::string& json) {
         
         pos = json.find("\"function\"", name_pos);
     }
-    
+
     return tools;
 }
 
-inline void parse_options_json(const std::string& json,
-                               float& temperature, float& top_p,
-                               size_t& top_k, size_t& max_tokens,
-                               std::vector<std::string>& stop_sequences,
-                               bool& force_tools,
-                               size_t& tool_rag_top_k,
-                               float& confidence_threshold,
-                               bool& include_stop_sequences,
-                               bool& use_vad,
-                               bool& telemetry_enabled,
-                               bool* auto_handoff = nullptr,
-                               size_t* cloud_timeout_ms = nullptr,
-                               bool* handoff_with_images = nullptr) {
-    temperature = 0.0f;
-    top_p = 0.0f;
-    top_k = 0;
-    max_tokens = 100;
-    force_tools = false;
-    tool_rag_top_k = 2;
-    confidence_threshold = 0.7f;
-    include_stop_sequences = false;
-    use_vad = true;
-    telemetry_enabled = true;
-    if (auto_handoff) *auto_handoff = true;
-    if (cloud_timeout_ms) *cloud_timeout_ms = 15000;
-    if (handoff_with_images) *handoff_with_images = true;
-    stop_sequences.clear();
+// Finds the first `"key"` that is followed by ':' and parses the number after it into out_value.
+// Returns false, leaving out_value untouched, when no such key exists or std::stof rejects the value.
+inline bool try_parse_json_float(const std::string& json, const std::string& key, float& out_value) {
+    const std::string quoted_key = "\"" + key + "\"";
+    for (size_t pos = json.find(quoted_key); pos != std::string::npos; pos = json.find(quoted_key, pos + 1)) {
+        size_t start = pos + quoted_key.size();
+        // JSON permits whitespace before and after ':', so `"key" : 1.5` must be accepted too.
+        while (start < json.size() && std::isspace(static_cast<unsigned char>(json[start]))) ++start;
+        if (start >= json.size() || json[start] != ':') continue;  // key text used as a value: keep looking
+        do { ++start; } while (start < json.size() && std::isspace(static_cast<unsigned char>(json[start])));
+
+        // The value runs up to the next separator or whitespace character (or the end of the input).
+        const size_t end = std::min(json.find_first_of(",}] \t\n\r", start), json.size());
+        try { out_value = std::stof(json.substr(start, end - start)); return true; }
+        catch (...) { return false; }
+    }
+    return false;
+}
+
+inline std::vector<std::string> parse_json_string_array_field(const std::string& json, const std::string& key) {  // Returns the decoded strings of the array stored under `key`; empty when absent or malformed.
+    std::vector<std::string> out;
+    std::string pattern = "\"" + key + "\":";  // NOTE(review): no whitespace allowed between the key and ':'
+    size_t pos = json.find(pattern);
+    if (pos == std::string::npos) return out;
+
+    size_t start = pos + pattern.size();
+    while (start < json.size() && std::isspace(static_cast<unsigned char>(json[start]))) ++start;
+    if (start >= json.size() || json[start] != '[') return out;  // value is not an array
+
+    int depth = 1;  // phase 1: find the matching ']' while ignoring brackets inside string literals
+    bool in_string = false;
+    bool escaped = false;
+    size_t end = start + 1;
+
+    while (end < json.size() && depth > 0) {
+        char c = json[end];
+        if (in_string) {
+            if (escaped) escaped = false;
+            else if (c == '\\') escaped = true;
+            else if (c == '"') in_string = false;
+        } else {
+            if (c == '"') in_string = true;
+            else if (c == '[') depth++;
+            else if (c == ']') depth--;
+        }
+        ++end;
+    }
+
+    if (depth != 0) return out;  // unterminated array
+    const std::string array_json = json.substr(start, end - start);  // includes both brackets
+    if (array_json.size() < 2 || array_json.front() != '[' || array_json.back() != ']') return out;
+
+    size_t i = 1;  // phase 2: walk the elements between the brackets
+    while (i + 1 < array_json.size()) {
+        while (i + 1 < array_json.size() &&
+               (std::isspace(static_cast<unsigned char>(array_json[i])) || array_json[i] == ',')) {
+            ++i;
+        }
+        if (i + 1 >= array_json.size() || array_json[i] == ']') break;
+        if (array_json[i] != '"') break;  // first non-string element ends parsing; strings found so far are kept
+
+        ++i;
+        std::string value;
+        bool escaped = false;  // shadows the phase-1 flag of the same name
+        while (i < array_json.size()) {
+            char c = array_json[i++];
+            if (escaped) {
+                switch (c) {
+                    case '"': value.push_back('"'); break;
+                    case '\\': value.push_back('\\'); break;
+                    case '/': value.push_back('/'); break;
+                    case 'b': value.push_back('\b'); break;
+                    case 'f': value.push_back('\f'); break;
+                    case 'n': value.push_back('\n'); break;
+                    case 'r': value.push_back('\r'); break;
+                    case 't': value.push_back('\t'); break;
+                    default: value.push_back(c); break;  // NOTE(review): \uXXXX is not decoded; 'u' and the hex digits are copied literally
+                }
+                escaped = false;
+                continue;
+            }
+            if (c == '\\') {
+                escaped = true;
+                continue;
+            }
+            if (c == '"') {
+                out.push_back(value);  // closing quote: the element is complete
+                break;
+            }
+            value.push_back(c);
+        }
+    }
 
+    return out;
+}
+
+inline void parse_custom_vocabulary_options(const std::string& json,  // Fills both outputs from the options JSON.
+                                            std::vector<std::string>& custom_vocabulary,
+                                            float& vocabulary_boost) {
+    custom_vocabulary.clear();  // outputs are always reset first: empty list, boost 5.0
+    vocabulary_boost = 5.0f;
     if (json.empty()) return;
 
+    float parsed_boost = vocabulary_boost;
+    if (try_parse_json_float(json, "vocabulary_boost", parsed_boost)) {
+        vocabulary_boost = std::clamp(parsed_boost, 0.0f, 20.0f);  // a supplied boost is limited to [0, 20]
+    }
+
+    custom_vocabulary = parse_json_string_array_field(json, "custom_vocabulary");  // empty when the key is absent
+}
+
+inline std::unordered_map<uint32_t, float> build_token_bias_map(const std::vector<std::vector<uint32_t>>& tokenized_entries,
+                                                                float vocabulary_boost) {
+    // Gives every token id that occurs in tokenized_entries the same bias: the boost limited
+    // to the range [0, 20]. A token that occurs several times still gets a single entry.
+    // A boost that clamps to zero produces an empty map.
+    const float boost = std::clamp(vocabulary_boost, 0.0f, 20.0f);
+    std::unordered_map<uint32_t, float> bias_by_token;
+    if (boost != 0.0f) {
+        for (const auto& entry_tokens : tokenized_entries) {
+            for (const uint32_t id : entry_tokens) {
+                float& slot = bias_by_token[id];  // value-initialised to 0 on first sight
+                slot = std::max(slot, boost);
+            }
+        }
+    }
+    return bias_by_token;
+}
+
+inline std::unordered_map<uint32_t, float> build_custom_vocabulary_bias(cactus::engine::Tokenizer* tokenizer,
+                                                                        const std::vector<std::string>& custom_vocabulary,
+                                                                        float vocabulary_boost) {
+    // Encodes each non-empty vocabulary entry and turns the token ids into a bias map.
+    std::vector<std::vector<uint32_t>> encoded;
+    if (tokenizer != nullptr && !custom_vocabulary.empty()) {
+        encoded.reserve(custom_vocabulary.size());
+        for (const std::string& term : custom_vocabulary) {
+            if (!term.empty()) encoded.push_back(tokenizer->encode(term));
+        }
+        return build_token_bias_map(encoded, vocabulary_boost);
+    }
+    return {};  // no tokenizer or nothing to boost
+}
+
+inline void apply_custom_vocabulary_options(cactus::engine::Model* model, const std::string& json) {
+    // Reads the vocabulary options from json and installs the resulting token bias on the model.
+    if (model == nullptr) return;
+    float boost = 5.0f;
+    std::vector<std::string> vocabulary;
+    parse_custom_vocabulary_options(json, vocabulary, boost);
+    model->set_vocab_bias(build_custom_vocabulary_bias(model->get_tokenizer(), vocabulary, boost));
+}
+
+inline size_t levenshtein_ci(const std::string& a, const std::string& b) {  // case-insensitive edit distance
+    std::vector<size_t> row(b.size() + 1);  // one rolling row of the DP table
+    for (size_t j = 0; j < row.size(); ++j) row[j] = j;
+    for (size_t i = 1; i <= a.size(); ++i) {
+        size_t diag = row[0];  // distance for (i-1, j-1)
+        row[0] = i;
+        for (size_t j = 1; j < row.size(); ++j) {
+            const size_t above = row[j];  // distance for (i-1, j), about to be overwritten
+            const bool same = std::tolower(static_cast<unsigned char>(a[i - 1])) == std::tolower(static_cast<unsigned char>(b[j - 1]));
+            row[j] = std::min({above + 1, row[j - 1] + 1, diag + (same ? 0 : 1)});
+            diag = above;
+        }
+    }
+    return row.back();
+}
+
+inline std::string collapse_spaces(const std::string& s) {
+    // Returns a copy of s with every ' ' removed. Only the space character is dropped:
+    // tabs, newlines and other whitespace are kept as they are.
+    std::string squeezed = s;
+    const auto new_end = std::remove(squeezed.begin(), squeezed.end(), ' ');
+    squeezed.erase(new_end, squeezed.end());
+    return squeezed;
+}
+
+inline void apply_vocabulary_spelling_correction(  // Rewrites `text` in place, replacing near-miss spellings with custom_vocabulary entries.
+    std::string& text,
+    const std::vector<std::string>& custom_vocabulary)
+{
+    if (custom_vocabulary.empty() || text.empty()) return;
+
+    struct VocabEntry {
+        const std::string* original;  // points into custom_vocabulary; used as the replacement text
+        std::string collapsed;  // the entry with ' ' removed; used for distance comparison
+    };
+    std::vector<VocabEntry> vocab_entries;
+    vocab_entries.reserve(custom_vocabulary.size());
+    for (const auto& v : custom_vocabulary) {
+        vocab_entries.push_back({&v, collapse_spaces(v)});
+    }
+
+    struct Token { std::string text; bool is_word; };
+    std::vector<Token> tokens;  // alternating runs: words (alnum, ' and -) and everything else
+    size_t pos = 0;
+    while (pos < text.size()) {
+        if (std::isalnum(static_cast<unsigned char>(text[pos])) ||
+            text[pos] == '\'' || text[pos] == '-') {
+            size_t start = pos;
+            while (pos < text.size() && (std::isalnum(static_cast<unsigned char>(text[pos])) ||
+                                          text[pos] == '\'' || text[pos] == '-')) {
+                ++pos;
+            }
+            tokens.push_back({text.substr(start, pos - start), true});
+        } else {
+            size_t start = pos;
+            while (pos < text.size() && !std::isalnum(static_cast<unsigned char>(text[pos])) &&
+                   text[pos] != '\'' && text[pos] != '-') {
+                ++pos;
+            }
+            tokens.push_back({text.substr(start, pos - start), false});
+        }
+    }
+
+    std::vector<size_t> word_indices;  // positions in `tokens` of the word tokens, in order
+    for (size_t i = 0; i < tokens.size(); ++i) {
+        if (tokens[i].is_word) word_indices.push_back(i);
+    }
+
+    std::vector<bool> consumed(tokens.size(), false);  // true = token is left out of the rebuilt text
+
+    auto strip_suffix = [](const std::string& word) -> std::pair<std::string, std::string> {  // returns {stem, suffix}; suffix is "" when nothing was stripped
+        if (word.size() >= 3 && word.substr(word.size() - 2) == "'s") {
+            return {word.substr(0, word.size() - 2), "'s"};
+        }
+        if (word.size() >= 3 && word.substr(word.size() - 2) == "'t") {
+            return {word.substr(0, word.size() - 2), "'t"};
+        }
+        if (word.size() >= 4 && word.back() == 's' &&
+            word[word.size() - 2] != 's' && // avoid stripping from "boss", "class"
+            std::isalpha(static_cast<unsigned char>(word[word.size() - 2]))) {
+            return {word.substr(0, word.size() - 1), "s"};
+        }
+        return {word, ""};
+    };
+
+    size_t wi = 0;  // index into word_indices of the word currently being examined
+    while (wi < word_indices.size()) {
+        size_t best_dist = std::numeric_limits<size_t>::max();
+        const std::string* best_match = nullptr;
+        size_t best_window = 0;
+        size_t best_first_token = 0;
+        size_t best_last_token = 0;
+        std::string best_suffix;
+
+        for (size_t window = std::min<size_t>(3, word_indices.size() - wi); window >= 1; --window) {  // try up to 3 consecutive words, widest first
+            std::string window_collapsed;  // the window's words concatenated without separators
+            const size_t first_tok = word_indices[wi];
+            const size_t last_tok = word_indices[wi + window - 1];
+            for (size_t w = 0; w < window; ++w) {
+                window_collapsed += tokens[word_indices[wi + w]].text;
+            }
+
+            if (window == 1 && window_collapsed.size() < 3) break;  // single words shorter than 3 chars are never corrected
+
+            auto [stem, suffix] = strip_suffix(window_collapsed);
+            const std::string* candidates[] = {&window_collapsed, &stem};  // whole text first, then the stem
+            const std::string suffixes[] = {"", suffix};  // suffix re-attached when the stem is the one that matched
+            const size_t num_candidates = suffix.empty() ? 1 : 2;
+
+            for (size_t ci = 0; ci < num_candidates; ++ci) {
+                const std::string& candidate = *candidates[ci];
+                if (candidate.empty()) continue;
+
+                for (const auto& entry : vocab_entries) {
+                    const size_t wlen = candidate.size();
+                    const size_t vlen = entry.collapsed.size();
+
+                    const size_t len_diff = wlen > vlen ? wlen - vlen : vlen - wlen;
+                    const size_t max_dist = std::max<size_t>(1, std::min(wlen, vlen) / 3);  // allowed edits: a third of the shorter length, at least 1
+                    if (len_diff > max_dist) continue;  // cheap reject before computing the distance
+
+                    const size_t dist = levenshtein_ci(candidate, entry.collapsed);
+
+                    // For single-edit corrections, require first char match to prevent
+                    // false positives like "vortex" → "Cortex".
+                    if (dist == 1 && window == 1) {
+                        const bool first_char_match =
+                            std::tolower(static_cast<unsigned char>(candidate[0])) ==
+                            std::tolower(static_cast<unsigned char>(entry.collapsed[0]));
+                        if (!first_char_match) continue;
+                    }
+
+                    if (dist <= max_dist && dist < best_dist) {  // strict '<': on ties the earlier candidate is kept
+                        best_dist = dist;
+                        best_match = entry.original;
+                        best_window = window;
+                        best_first_token = first_tok;
+                        best_last_token = last_tok;
+                        best_suffix = suffixes[ci];
+                    }
+                }
+            }
+
+            if (best_dist == 0) break;  // exact (case-insensitive) match: no narrower window can do better
+        }
+
+        // Allow dist==0 for multi-word merges where word boundaries changed.
+        const bool should_replace = best_match &&
+            best_dist != std::numeric_limits<size_t>::max() &&
+            (best_dist > 0 || best_window > 1);
+
+        if (should_replace) {
+            tokens[best_first_token].text = *best_match + best_suffix;  // the first token carries the whole replacement
+            for (size_t t = best_first_token + 1; t <= best_last_token; ++t) {
+                consumed[t] = true;  // drop the rest of the window, separators included
+            }
+            for (size_t t = best_first_token + 1; t <= best_last_token; ++t) {  // NOTE(review): looks redundant — these indices are already consumed and tokens[best_first_token] is a word
+                if (t > 0) consumed[t - 1] = consumed[t - 1] || !tokens[t - 1].is_word;
+            }
+            wi += best_window;  // continue after the words that were merged
+        } else {
+            ++wi;
+        }
+    }
+
+    std::string result;
+    result.reserve(text.size());
+    for (size_t i = 0; i < tokens.size(); ++i) {
+        if (!consumed[i]) {
+            result += tokens[i].text;
+        }
+    }
+
+    text = std::move(result);
+}
+
+inline InferenceOptions parse_inference_options_json(const std::string& json) {
+    InferenceOptions options;
+
+    if (json.empty()) return options;
+
     size_t pos = json.find("\"temperature\"");
     if (pos != std::string::npos) {
         pos = json.find(':', pos) + 1;
-        temperature = std::stof(json.substr(pos));
+        options.temperature = std::stof(json.substr(pos));
     }
 
     pos = json.find("\"top_p\"");
     if (pos != std::string::npos) {
         pos = json.find(':', pos) + 1;
-        top_p = std::stof(json.substr(pos));
+        options.top_p = std::stof(json.substr(pos));
     }
 
     pos = json.find("\"top_k\"");
     if (pos != std::string::npos) {
         pos = json.find(':', pos) + 1;
-        top_k = std::stoul(json.substr(pos));
+        options.top_k = std::stoul(json.substr(pos));
     }
 
     pos = json.find("\"max_tokens\"");
     if (pos != std::string::npos) {
         pos = json.find(':', pos) + 1;
-        max_tokens = std::stoul(json.substr(pos));
+        options.max_tokens = std::stoul(json.substr(pos));
     }
 
     pos = json.find("\"force_tools\"");
     if (pos != std::string::npos) {
         pos = json.find(':', pos) + 1;
-        while (pos < json.length() && std::isspace(json[pos])) pos++;
-        force_tools = (json.substr(pos, 4) == "true");
+        while (pos < json.length() && std::isspace(static_cast<unsigned char>(json[pos]))) pos++;
+        options.force_tools = (json.substr(pos, 4) == "true");
     }
 
     pos = json.find("\"tool_rag_top_k\"");
     if (pos != std::string::npos) {
         pos = json.find(':', pos) + 1;
-        tool_rag_top_k = std::stoul(json.substr(pos));
+        options.tool_rag_top_k = std::stoul(json.substr(pos));
     }
 
     pos = json.find("\"confidence_threshold\"");
     if (pos != std::string::npos) {
         pos = json.find(':', pos) + 1;
-        confidence_threshold = std::stof(json.substr(pos));
+        options.confidence_threshold = std::stof(json.substr(pos));
     }
 
     pos = json.find("\"include_stop_sequences\"");
     if (pos != std::string::npos) {
         pos = json.find(':', pos) + 1;
-        while (pos < json.length() && std::isspace(json[pos])) pos++;
-        include_stop_sequences = (json.substr(pos, 4) == "true");
+        while (pos < json.length() && std::isspace(static_cast<unsigned char>(json[pos]))) pos++;
+        options.include_stop_sequences = (json.substr(pos, 4) == "true");
     }
 
     pos = json.find("\"use_vad\"");
     if (pos != std::string::npos) {
         pos = json.find(':', pos) + 1;
-        while (pos < json.length() && std::isspace(json[pos])) pos++;
-        use_vad = (json.substr(pos, 4) == "true");
+        while (pos < json.length() && std::isspace(static_cast<unsigned char>(json[pos]))) pos++;
+        options.use_vad = (json.substr(pos, 4) == "true");
     }
 
     pos = json.find("\"telemetry_enabled\"");
     if (pos != std::string::npos) {
         pos = json.find(':', pos) + 1;
-        while (pos < json.length() && std::isspace(json[pos])) pos++;
-        telemetry_enabled = (json.substr(pos, 4) == "true");
+        while (pos < json.length() && std::isspace(static_cast<unsigned char>(json[pos]))) pos++;
+        options.telemetry_enabled = (json.substr(pos, 4) == "true");
     }
 
-    if (auto_handoff) {
-        pos = json.find("\"auto_handoff\"");
-        if (pos != std::string::npos) {
-            pos = json.find(':', pos) + 1;
-            while (pos < json.length() && std::isspace(json[pos])) pos++;
-            *auto_handoff = (json.substr(pos, 4) == "true");
-        }
+    pos = json.find("\"auto_handoff\"");
+    if (pos != std::string::npos) {
+        pos = json.find(':', pos) + 1;
+        while (pos < json.length() && std::isspace(static_cast<unsigned char>(json[pos]))) pos++;
+        options.auto_handoff = (json.substr(pos, 4) == "true");
     }
 
-    if (cloud_timeout_ms) {
-        pos = json.find("\"cloud_timeout_ms\"");
-        if (pos != std::string::npos) {
-            pos = json.find(':', pos) + 1;
-            *cloud_timeout_ms = std::stoul(json.substr(pos));
-        }
+    pos = json.find("\"cloud_timeout_ms\"");
+    if (pos != std::string::npos) {
+        pos = json.find(':', pos) + 1;
+        options.cloud_timeout_ms = std::stoul(json.substr(pos));
     }
 
-    if (handoff_with_images) {
-        pos = json.find("\"handoff_with_images\"");
-        if (pos != std::string::npos) {
-            pos = json.find(':', pos) + 1;
-            while (pos < json.length() && std::isspace(json[pos])) pos++;
-            *handoff_with_images = (json.substr(pos, 4) == "true");
-        }
+    pos = json.find("\"handoff_with_images\"");
+    if (pos != std::string::npos) {
+        pos = json.find(':', pos) + 1;
+        while (pos < json.length() && std::isspace(static_cast<unsigned char>(json[pos]))) pos++;
+        options.handoff_with_images = (json.substr(pos, 4) == "true");
+    }
+
+    pos = json.find("\"enable_thinking_if_supported\"");
+    if (pos != std::string::npos) {
+        pos = json.find(':', pos) + 1;
+        while (pos < json.length() && std::isspace(static_cast<unsigned char>(json[pos]))) pos++;
+        options.enable_thinking_if_supported = (json.substr(pos, 4) == "true");
     }
 
     pos = json.find("\"stop_sequences\"");
@@ -673,12 +1322,14 @@ inline void parse_options_json(const std::string& json,
                 size_t seq_start = seq_pos + 1;
                 size_t seq_end = json.find('"', seq_start);
                 if (seq_end != std::string::npos) {
-                    stop_sequences.push_back(json.substr(seq_start, seq_end - seq_start));
+                    options.stop_sequences.push_back(json.substr(seq_start, seq_end - seq_start));
                 }
                 seq_pos = json.find('"', seq_end + 1);
             }
         }
     }
+
+    return options;
 }
 
 static inline std::string trim_lfm2_slice(const std::string& value, size_t begin, size_t end) {
@@ -755,7 +1406,6 @@ inline void parse_function_calls_from_response(const std::string& response_text,
 
     gemma::parse_function_calls(regular_response, function_calls);
 
-    // Parse Qwen-style function calls: <tool_call>{"name": "...", "arguments": {...}}</tool_call>
     const std::string QWEN_TOOL_START = "<tool_call>";
     const std::string QWEN_TOOL_END = "</tool_call>";
     size_t qwen_start_pos = 0;
@@ -764,27 +1414,62 @@ inline void parse_function_calls_from_response(const std::string& response_text,
         size_t content_start = qwen_start_pos + QWEN_TOOL_START.length();
         size_t qwen_end_pos = regular_response.find(QWEN_TOOL_END, content_start);
 
+        size_t erase_end;
+        std::string json_content;
+
         if (qwen_end_pos != std::string::npos) {
-            std::string json_content = regular_response.substr(content_start, qwen_end_pos - content_start);
+            json_content = regular_response.substr(content_start, qwen_end_pos - content_start);
+            erase_end = qwen_end_pos + QWEN_TOOL_END.length();
+        } else {
+            json_content = regular_response.substr(content_start);
+            erase_end = regular_response.length();
+        }
 
-            size_t first = json_content.find_first_not_of(" \t\n\r");
-            size_t last = json_content.find_last_not_of(" \t\n\r");
-            if (first != std::string::npos && last != std::string::npos) {
-                json_content = json_content.substr(first, last - first + 1);
-            }
+        size_t first = json_content.find_first_not_of(" \t\n\r");
+        size_t last = json_content.find_last_not_of(" \t\n\r");
+        if (first != std::string::npos && last != std::string::npos) {
+            json_content = json_content.substr(first, last - first + 1);
+        }
 
-            if (json_content.size() > 2 && json_content[0] == '{' &&
-                json_content.find("\"name\"") != std::string::npos) {
-                function_calls.push_back(json_content);
+        if (json_content.size() > 2 && json_content[0] == '{' &&
+            json_content.find("\"name\"") != std::string::npos) {
+            size_t depth = 0;
+            bool in_string = false;
+            bool escaped = false;
+            size_t end_pos = 0;
+            for (size_t c = 0; c < json_content.size(); c++) {
+                char ch = json_content[c];
+                if (escaped) {
+                    escaped = false;
+                    continue;
+                }
+                if (ch == '\\' && in_string) {
+                    escaped = true;
+                    continue;
+                }
+                if (ch == '"') {
+                    in_string = !in_string;
+                    continue;
+                }
+                if (!in_string) {
+                    if (ch == '{') depth++;
+                    else if (ch == '}') {
+                        depth--;
+                        if (depth == 0) {
+                            end_pos = c + 1;
+                            break;
+                        }
+                    }
+                }
+            }
+            if (end_pos > 0) {
+                function_calls.push_back(json_content.substr(0, end_pos));
             }
-
-            regular_response.erase(qwen_start_pos, qwen_end_pos + QWEN_TOOL_END.length() - qwen_start_pos);
-        } else {
-            break;
         }
+
+        regular_response.erase(qwen_start_pos, erase_end - qwen_start_pos);
     }
-    
-    // Parse LFM2-style function calls: <|tool_call_start|>[name(args)]<|tool_call_end|>
+
     const std::string TOOL_CALL_START = "<|tool_call_start|>";
     const std::string TOOL_CALL_END = "<|tool_call_end|>";
     size_t tool_start_pos = 0;
@@ -898,6 +1583,95 @@ inline void parse_function_calls_from_response(const std::string& response_text,
     }
 }
 
+// Locates every channel block in `tokens`. A block starts at a token equal to
+// `channel_open_id` and runs through the next `channel_close_id` token
+// (inclusive); when no close token follows, it runs to the end of `tokens`.
+// Each returned pair is {offset + index of the open token, tokens in the block}.
+inline std::vector<std::pair<size_t, size_t>> find_channel_token_ranges(
+    const std::vector<uint32_t>& tokens, size_t offset,
+    uint32_t channel_open_id, uint32_t channel_close_id) {
+    std::vector<std::pair<size_t, size_t>> found;
+    const size_t total = tokens.size();
+    size_t cursor = 0;
+    while (cursor < total) {
+        if (tokens[cursor] != channel_open_id) { ++cursor; continue; }
+        const size_t open_at = cursor;
+        // Step over the open token, then advance until a close token or the end.
+        for (++cursor; cursor < total; ++cursor) {
+            if (tokens[cursor] == channel_close_id) break;
+        }
+        // Stopping before the end means a close token was hit: count it too.
+        if (cursor < total) ++cursor;
+        found.emplace_back(offset + open_at, cursor - open_at);
+    }
+    return found;
+}
+
+// Removes tagged blocks from `text` and appends their inner content to
+// `extracted`, separating appended pieces with '\n'. A block runs from
+// `open_tag` to the next `close_tag`; an unterminated block runs to the end
+// of `text`. If a `close_tag` appears before any `open_tag`, everything ahead
+// of it is treated as the tail of a block whose opening tag is not in `text`.
+// Empty tags cannot delimit anything (two empty tags would stall the scan),
+// so both arguments are left untouched when either tag is empty.
+inline void strip_tag_blocks(std::string& text, std::string& extracted,
+                             const std::string& open_tag, const std::string& close_tag) {
+    if (open_tag.empty() || close_tag.empty()) return;
+    auto collect = [&extracted](const std::string& piece) {
+        if (!extracted.empty()) extracted += "\n";
+        extracted += piece;
+    };
+    std::string kept;
+    size_t pos = 0;
+    const size_t first_close = text.find(close_tag);
+    if (first_close != std::string::npos && first_close < text.find(open_tag)) {  // npos compares greater than any index
+        collect(text.substr(0, first_close));
+        pos = first_close + close_tag.size();
+    }
+    while (pos < text.size()) {
+        const size_t open_pos = text.find(open_tag, pos);
+        if (open_pos == std::string::npos) { kept += text.substr(pos); break; }
+        kept += text.substr(pos, open_pos - pos);
+        const size_t content_start = open_pos + open_tag.size();
+        const size_t close_pos = text.find(close_tag, content_start);
+        if (close_pos == std::string::npos) { collect(text.substr(content_start)); break; }
+        collect(text.substr(content_start, close_pos - content_start));
+        pos = close_pos + close_tag.size();
+    }
+    text.swap(kept);
+}
+
+// Splits a model response into its reasoning text and its visible answer.
+// `content` starts as a copy of `input`; when it contains "<|channel>" /
+// "<channel|>" markers (checked first) or "<think>" / "</think>" markers,
+// the tagged blocks are moved into `thinking` via strip_tag_blocks and both
+// outputs are whitespace-trimmed. Without markers `content` is the unmodified
+// input and `thinking` is empty.
+inline void strip_thinking_block(const std::string& input, std::string& thinking, std::string& content) {
+    thinking.clear();
+    content = input;
+
+    const auto has = [&content](const char* tag) { return content.find(tag) != std::string::npos; };
+    const auto strip_ws = [](std::string& s) {
+        const size_t head = s.find_first_not_of(" \t\n\r");
+        if (head == std::string::npos) { s.clear(); return; }  // empty or all whitespace
+        s = s.substr(head, s.find_last_not_of(" \t\n\r") - head + 1);
+    };
+
+    const bool channel_style = has("<|channel>") || has("<channel|>");
+    if (!channel_style && !has("<think>") && !has("</think>")) return;
+    if (channel_style) strip_tag_blocks(content, thinking, "<|channel>", "<channel|>");
+    else strip_tag_blocks(content, thinking, "<think>", "</think>");
+    strip_ws(thinking);
+    strip_ws(content);
+}
+
+struct TranscriptSegment {  // one element of the "segments" array written by construct_response_json
+    float start;  // emitted as "start" with 3 decimals; presumably a time in seconds — TODO confirm
+    float end;  // emitted as "end" with 3 decimals; presumably the same unit as `start` — TODO confirm
+    std::string text;  // emitted as "text" after escape_json_string
+};
+
 inline std::string construct_response_json(const std::string& regular_response,
                                            const std::vector<std::string>& function_calls,
                                            double time_to_first_token,
@@ -907,19 +1681,32 @@ inline std::string construct_response_json(const std::string& regular_response,
                                            size_t prompt_tokens,
                                            size_t completion_tokens,
                                            float confidence = 0.0f,
-                                           bool cloud_handoff = false) {
+                                           bool cloud_handoff = false,
+                                           const std::string& thinking = "",
+                                           const std::vector<TranscriptSegment>& segments = {}) {
     std::ostringstream json;
     json << "{";
     json << "\"success\":true,";
     json << "\"error\":null,";
     json << "\"cloud_handoff\":" << (cloud_handoff ? "true" : "false") << ",";
     json << "\"response\":\"" << escape_json_string(regular_response) << "\",";
+    if (!thinking.empty()) {
+        json << "\"thinking\":\"" << escape_json_string(thinking) << "\",";
+    }
     json << "\"function_calls\":[";
     for (size_t i = 0; i < function_calls.size(); ++i) {
         if (i > 0) json << ",";
         json << function_calls[i];
     }
     json << "],";
+    json << "\"segments\":[";
+    for (size_t i = 0; i < segments.size(); ++i) {
+        if (i > 0) json << ",";
+        json << "{\"start\":" << std::fixed << std::setprecision(3) << segments[i].start
+             << ",\"end\":" << std::fixed << std::setprecision(3) << segments[i].end
+             << ",\"text\":\"" << escape_json_string(segments[i].text) << "\"}";
+    }
+    json << "],";
     json << "\"confidence\":" << std::fixed << std::setprecision(4) << confidence << ",";
     json << "\"time_to_first_token_ms\":" << std::fixed << std::setprecision(2) << time_to_first_token << ",";
     json << "\"total_time_ms\":" << std::fixed << std::setprecision(2) << total_time_ms << ",";
@@ -945,6 +1732,50 @@ inline std::string serialize_function_calls(const std::vector<std::string>& call
     return oss.str();
 }
 
+inline int validate_audio_params(  // checks the arguments of an audio call: returns 0 when acceptable, -1 after logging and reporting the first failed check
+    const char* component,  // tag forwarded to CACTUS_LOG_ERROR
+    void* model,  // only tested for null here
+    char* response_buffer, size_t buffer_size,  // passed to handle_error_response whenever a check fails
+    const char* audio_file_path,  // exactly one of audio_file_path / pcm_buffer may be supplied
+    const uint8_t* pcm_buffer, size_t pcm_buffer_size) {  // size is in bytes and must be even
+    if (!model) {
+        std::string err = last_error_message.empty() ? "Model not initialized." : last_error_message;  // prefer the recorded error text when one exists
+        CACTUS_LOG_ERROR(component, err);
+        handle_error_response(err, response_buffer, buffer_size);
+        return -1;
+    }
+    if (!response_buffer || buffer_size == 0) {  // NOTE(review): this runs after the model check, and both paths hand a possibly-null buffer to handle_error_response — confirm it tolerates that
+        CACTUS_LOG_ERROR(component, "Invalid parameters: response_buffer or buffer_size");
+        handle_error_response("Invalid parameters", response_buffer, buffer_size);
+        return -1;
+    }
+    if (!audio_file_path && (!pcm_buffer || pcm_buffer_size == 0)) {  // neither a path nor a non-empty PCM buffer
+        CACTUS_LOG_ERROR(component, "No audio input provided");
+        handle_error_response("Either audio_file_path or pcm_buffer must be provided", response_buffer, buffer_size);
+        return -1;
+    }
+    if (audio_file_path && pcm_buffer && pcm_buffer_size > 0) {  // both inputs given: ambiguous, so rejected
+        CACTUS_LOG_ERROR(component, "Both audio_file_path and pcm_buffer provided");
+        handle_error_response("Cannot provide both audio_file_path and pcm_buffer", response_buffer, buffer_size);
+        return -1;
+    }
+    if (pcm_buffer && pcm_buffer_size > 0 && (pcm_buffer_size < 2 || pcm_buffer_size % 2 != 0)) {  // odd byte counts are rejected (the `< 2` case is size 1, which is also odd)
+        CACTUS_LOG_ERROR(component, "Invalid pcm_buffer_size");
+        handle_error_response("pcm_buffer_size must be even and at least 2 bytes", response_buffer, buffer_size);
+        return -1;
+    }
+    return 0;
+}
+
+// Decodes 16-bit signed little-endian PCM bytes into floats in [-1, 1);
+// a null buffer yields an empty vector and a trailing odd byte is ignored.
+inline std::vector<float> pcm_to_float(const uint8_t* pcm_buffer, size_t pcm_buffer_size) {
+    std::vector<float> out(pcm_buffer ? pcm_buffer_size / 2 : 0);
+    for (size_t i = 0; i < out.size(); ++i)  // samples are built from their two bytes: the buffer need not be int16_t-aligned
+        out[i] = static_cast<int16_t>(pcm_buffer[2 * i] | (pcm_buffer[2 * i + 1] << 8)) / 32768.0f;
+    return out;
+}
+
 } // namespace ffi
 } // namespace cactus
 
@@ -958,4 +1789,4 @@ const char* cactus_get_last_error();
 }
 #endif
 
-#endif // CACTUS_UTILS_H
\ No newline at end of file
+#endif // CACTUS_UTILS_H
diff --git a/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/engine.h b/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/engine.h
index c8bf34a..f686fd7 100644
--- a/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/engine.h
+++ b/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/engine.h
@@ -75,6 +75,7 @@ struct Config {
     bool use_pixel_shuffle = false;
     uint32_t pixel_shuffle_factor = 1;
     bool use_image_tokens = false;
+    uint32_t image_token_id = 0;
     bool use_layout_tags = false;
     uint32_t image_seq_len = 64;
 
@@ -107,6 +108,26 @@ struct Config {
     uint32_t subsampling_factor = 0;
     uint32_t num_mel_bins = 80;
     std::string encoder_hidden_act = "silu";
+    uint32_t linear_num_key_heads = 0;
+    uint32_t linear_key_head_dim = 0;
+    uint32_t linear_num_value_heads = 0;
+    uint32_t linear_value_head_dim = 0;
+    uint32_t linear_q_proj_dim = 0;
+    uint32_t linear_k_proj_dim = 0;
+    uint32_t linear_v_proj_dim = 0;
+
+    uint32_t kv_lora_rank = 0;
+    uint32_t q_lora_rank = 0;
+    uint32_t qk_head_dim = 0;
+    uint32_t qk_nope_head_dim = 0;
+    uint32_t qk_rope_head_dim = 0;
+    uint32_t v_head_dim = 0;
+    uint32_t rope_interleave = 0;
+    bool attention_bias = false;
+    float rope_scaling_factor = 1.0f;
+    float rope_mscale_all_dim = 0.0f;
+
+    enum class ModelType {QWEN = 0, GEMMA = 1, NOMIC = 3, LFM2 = 5, SIGLIP2 = 6, WHISPER = 7, MOONSHINE = 8, SILERO_VAD = 9, PARAKEET = 10, QWEN3P5 = 11, PARAKEET_TDT = 12, GEMMA3N = 13, YOUTU = 14, GEMMA4 = 15, PYANNOTE = 16, WESPEAKER = 17};
     uint32_t predictor_hidden_dim = 0;
     uint32_t predictor_num_layers = 0;
     uint32_t tdt_joint_dim = 0;
@@ -114,7 +135,6 @@ struct Config {
     uint32_t tdt_blank_id = 0;
     std::vector<uint32_t> tdt_durations;
 
-    enum class ModelType {QWEN = 0, GEMMA = 1, NOMIC = 3, LFM2 = 5, SIGLIP2 = 6, WHISPER = 7, MOONSHINE = 8, SILERO_VAD = 9, PARAKEET = 10, PARAKEET_TDT = 11};
     ModelType model_type = ModelType::QWEN;
 
     enum class ModelVariant {DEFAULT = 0, VLM = 1, EXTRACT = 2, RAG = 3};
@@ -138,6 +158,58 @@ struct Config {
     std::vector<std::string> layer_types;
     size_t conv_L_cache = 0;
 
+    uint32_t altup_num_inputs = 4;
+    uint32_t laurel_rank = 64;
+    uint32_t hidden_size_per_layer_input = 256;
+    uint32_t num_kv_shared_layers = 0;
+    uint32_t sliding_window = 512;
+    float rope_local_base_freq = 10000.0f;
+    float final_logit_softcapping = 0.0f;
+    float global_partial_rotary_factor = 1.0f;
+    uint32_t expert_intermediate_size = 0;
+    uint32_t global_head_dim = 0;
+    uint32_t num_global_kv_heads = 0;
+    bool attention_k_eq_v = false;
+    bool enable_moe_block = false;
+    std::vector<float> activation_sparsity_ppf;
+
+    uint32_t vision_head_dim = 64;
+    uint32_t vision_kv_heads = 12;
+    uint32_t vision_intermediate_size = 3072;
+    uint32_t vision_position_embedding_size = 10240;
+    uint32_t vision_pooling_kernel_size = 3;
+    uint32_t vision_default_output_length = 280;
+    float vision_rope_theta = 100.0f;
+
+    uint32_t audio_hidden_dim = 0;
+    uint32_t audio_num_layers = 0;
+    uint32_t audio_num_heads = 0;
+    uint32_t audio_head_dim = 0;
+    uint32_t audio_input_feat_size = 128;
+    uint32_t audio_conf_conv_kernel_size = 5;
+    uint32_t audio_chunk_size = 12;
+    uint32_t audio_context_left = 13;
+    uint32_t audio_context_right = 0;
+    float audio_logit_cap = 50.0f;
+    float audio_residual_weight = 0.5f;
+    uint32_t audio_output_proj_dims = 0;
+    uint32_t audio_vocab_size = 128;
+    uint32_t audio_vocab_offset = 0;
+    uint32_t audio_soft_tokens = 188;
+    uint32_t audio_sscp_conv0_channels = 128;
+    uint32_t audio_sscp_conv1_channels = 32;
+    float audio_sscp_conv_eps = 1e-3f;
+    float audio_rms_norm_eps = 1e-6f;
+    uint32_t audio_fft_length = 1024;
+    uint32_t audio_token_id = 0;
+    bool audio_fft_overdrive = false;
+    uint32_t channel_open_token_id = 100;
+    uint32_t channel_close_token_id = 101;
+
+    static bool is_gemma_family(ModelType t) {  // true exactly for the GEMMA, GEMMA3N and GEMMA4 model types
+        return t == ModelType::GEMMA || t == ModelType::GEMMA3N || t == ModelType::GEMMA4;
+    }
+
     bool from_json(const std::string& json_path);
     std::string to_json() const;
 };
@@ -155,14 +227,38 @@ struct MergeRule {
 };
 
 
+struct ToolCallInfo {  // element type of ChatMessage::tool_calls
+    std::string name;  // presumably the name of the called function — TODO confirm
+    std::string arguments;  // presumably the call's arguments serialized as text (JSON?) — TODO confirm against the prompt formatters
+};
+
 struct ChatMessage {
     std::string role;
     std::string content;
     std::string name;
     std::vector<std::string> images;
+    std::vector<std::string> audio;  // presumably audio inputs attached to the message, by analogy with `images` — TODO confirm
+    size_t audio_soft_token_count = 0;  // NOTE(review): looks like the number of soft tokens the audio occupies in the prompt — confirm where it is set
+    std::vector<ToolCallInfo> tool_calls;  // tool calls attached to this message, one name/arguments pair each
 };
 
+struct TokenizerRuntimeConfig {  // result type of load_tokenizer_runtime_config; Tokenizer keeps one as runtime_config_
+    enum class TokenizerType { UNKNOWN, BPE, SENTENCEPIECE };
+    enum class VocabFormat { UNKNOWN, ID_TAB_TOKEN, LINE_TOKEN };  // NOTE(review): names suggest "id<TAB>token" lines vs. one token per line — confirm in the loader
+    enum class Normalizer { NONE, METASPACE, BYTE_LEVEL };
+    enum class Decoder { NONE, REPLACE_METASPACE, BYTE_LEVEL };
+
+    TokenizerType tokenizer_type = TokenizerType::UNKNOWN;  // UNKNOWN until set
+    VocabFormat vocab_format = VocabFormat::UNKNOWN;  // UNKNOWN until set
+    Normalizer normalizer = Normalizer::NONE;  // no normalizer by default
+    Decoder decoder = Decoder::NONE;  // no decoder step by default
+    bool byte_fallback = false;  // off by default
+    bool has_chat_template = false;  // off by default
+};
 
+TokenizerRuntimeConfig load_tokenizer_runtime_config(const std::string& config_file);
+void load_special_tokens_map(const std::string& config_file, std::unordered_map<std::string, uint32_t>& special_tokens);
+std::vector<std::string> split_with_special_tokens(const std::string& text, const std::unordered_map<std::string, uint32_t>& special_tokens);
 
 class Tokenizer {
 public:
@@ -172,7 +268,7 @@ class Tokenizer {
     virtual std::string decode(const std::vector<uint32_t>& tokens) const = 0;
 
     virtual std::vector<uint32_t> apply_chat_template(const std::vector<ChatMessage>& messages, bool add_generation_prompt = true) const;
-    virtual std::string format_chat_prompt(const std::vector<ChatMessage>& messages, bool add_generation_prompt = true, const std::string& tools_json = "") const;
+    virtual std::string format_chat_prompt(const std::vector<ChatMessage>& messages, bool add_generation_prompt = true, const std::string& tools_json = "", bool enable_thinking_if_supported = true) const;
 
     virtual uint32_t get_vocab_size() const = 0;
     virtual uint32_t get_unk_token() const = 0;
@@ -188,7 +284,7 @@ class Tokenizer {
     uint32_t get_global_img_token_id() const { return global_img_token_id_; }
 
 protected:
-    enum class ModelType { UNKNOWN, QWEN, GEMMA, LFM2, BERT, WHISPER, PARAKEET};
+    enum class ModelType { UNKNOWN, QWEN, QWEN3P5, GEMMA, GEMMA4, LFM2, BERT, WHISPER, PARAKEET, YOUTU};
     ModelType model_type_ = ModelType::UNKNOWN;
     enum class ModelVariant { DEFAULT, VLM, EXTRACT, RAG};
     ModelVariant model_variant_ = ModelVariant::DEFAULT;
@@ -199,11 +295,21 @@ class Tokenizer {
     uint32_t fake_token_id_ = 49189;
     uint32_t global_img_token_id_ = 49152;
 
+
+    uint32_t vision_patch_size_ = 16;
+    uint32_t vision_pooling_kernel_size_ = 3;
+    uint32_t vision_default_output_length_ = 280;
+    uint32_t vision_image_size_ = 768;
+    TokenizerRuntimeConfig runtime_config_;
+
     void detect_model_type(const std::string& config_path);
-    std::string format_qwen_style(const std::vector<ChatMessage>& messages, bool add_generation_prompt, const std::string& tools_json) const;
+    void load_chat_template(const std::string& template_file);
+    std::string format_qwen_style(const std::vector<ChatMessage>& messages, bool add_generation_prompt, const std::string& tools_json, bool enable_thinking_if_supported = true) const;
     std::string format_gemma_style(const std::vector<ChatMessage>& messages, bool add_generation_prompt, const std::string& tools_json) const;
+    std::string format_gemma4_style(const std::vector<ChatMessage>& messages, bool add_generation_prompt, const std::string& tools_json, bool enable_thinking_if_supported = true) const;
     std::string format_lfm2_style(const std::vector<ChatMessage>& messages, bool add_generation_prompt, const std::string& tools_json) const;
     std::string format_lfm2_vl_style(const std::vector<ChatMessage>& messages, bool add_generation_prompt, const std::string& tools_json) const;
+    std::string format_youtu_style(const std::vector<ChatMessage>& messages, bool add_generation_prompt, const std::string& tools_json) const;
 };
 
 class BPETokenizer : public Tokenizer {
@@ -245,6 +351,7 @@ class BPETokenizer : public Tokenizer {
     std::string bytes_to_unicode(const std::string& text) const;
     std::string unicode_to_bytes(const std::string& text) const;
     std::vector<std::string> byte_level_split(const std::string& text) const;
+    std::vector<std::string> utf8_split(const std::string& text) const;
 
     void cleanup_mmap();
     
@@ -256,12 +363,6 @@ class BPETokenizer : public Tokenizer {
     std::unordered_map<std::string, uint32_t> special_tokens_;
     std::vector<std::string> split_with_special_tokens(const std::string& text) const;
     void load_special_tokens(const std::string& config_file);
-
-    void load_chat_template(const std::string& template_file);
-
-    std::unordered_map<std::string, uint32_t> tool_tokens_;
-    bool has_tool_support_;
-    void load_tokenizer_config(const std::string& config_file);
 };
 
 class SPTokenizer : public Tokenizer {
@@ -311,8 +412,6 @@ class SPTokenizer : public Tokenizer {
     std::unordered_map<std::string, uint32_t> special_tokens_;
     std::vector<std::string> split_with_special_tokens(const std::string& text) const;
     void load_special_tokens(const std::string& config_file);
-
-    void load_chat_template(const std::string& template_file);
 };
 
 class ConvCache {
@@ -355,8 +454,10 @@ struct KVCache {
     struct LayerCache {
         std::vector<uint8_t> keys;
         std::vector<uint8_t> values;
-        std::vector<float> key_scales;   
-        std::vector<float> value_scales; 
+        std::vector<float> key_scales;
+        std::vector<float> value_scales;
+        size_t head_dim = 0;
+        size_t kv_heads = 0;
     };
 
     std::vector<LayerCache> layer_caches;
@@ -366,8 +467,6 @@ struct KVCache {
     size_t current_seq_len = 0;
     size_t total_seq_len = 0;
     size_t max_seq_len = 2048;
-    size_t num_kv_heads = 0;
-    size_t head_dim = 0;
     size_t num_layers = 0;
     Precision precision;
     size_t element_size = 4;
@@ -375,12 +474,14 @@ struct KVCache {
     void set_window_size(size_t window, size_t sink = DEFAULT_SINK_SIZE);
     size_t get_effective_seq_len() const { return current_seq_len; }
     size_t get_total_seq_len() const { return total_seq_len; }
+    size_t get_layer_head_dim(size_t layer_idx) const { return layer_caches[layer_idx].head_dim; }  // per-layer head_dim; unchecked index, layer_idx must be < layer_caches.size()
+    size_t get_layer_kv_heads(size_t layer_idx) const { return layer_caches[layer_idx].kv_heads; }  // per-layer kv_heads; unchecked index, layer_idx must be < layer_caches.size()
 
-    void init(size_t num_layers, size_t max_seq, size_t num_kv_heads, size_t head_dim, Precision model_precision);
+    void init(size_t num_layers, size_t max_seq, const std::vector<size_t>& layer_dims, const std::vector<size_t>& layer_kv_heads, Precision model_precision);
     void reset();
     void update_from_graph(CactusGraph* gb, const std::vector<size_t>& k_nodes,
                           const std::vector<size_t>& v_nodes, size_t seq_len,
-                          size_t num_layers, size_t kv_heads, size_t head_dim);
+                          size_t num_layers);
 
     void update_from_npu(size_t layer_idx, const __fp16* k_data, const __fp16* v_data,
                          size_t num_tokens, size_t kv_heads, size_t head_dim);
@@ -404,6 +505,9 @@ struct KVCache {
     const int8_t* get_values_int8(size_t layer) const;
     const float* get_key_scales(size_t layer) const;
     const float* get_value_scales(size_t layer) const;
+
+    void remove_token_range(size_t start, size_t count);
+    void compact_to_windows(const std::vector<size_t>& target_windows);
 };
 
 class ToolCallConstrainer {
@@ -421,7 +525,7 @@ class ToolCallConstrainer {
         QWEN_EXPECT_ARGS_COLON, 
         QWEN_IN_ARGUMENTS,  
         QWEN_EXPECT_CLOSE_BRACE,
-        QWEN_EXPECT_END, 
+        QWEN_EXPECT_END,
 
         LFM_START,              
         LFM_EXPECT_BRACKET, 
@@ -457,12 +561,17 @@ class ToolCallConstrainer {
     Config::ModelType model_type_ = Config::ModelType::QWEN;
     Tokenizer* tokenizer_ = nullptr;
 
+    bool is_gemma_family() const { return Config::is_gemma_family(model_type_); }  // applies Config::is_gemma_family to this constrainer's model_type_
+
     std::vector<std::string> function_names_;
     std::string generated_text_;
-    int brace_depth_ = 0;  
+    int brace_depth_ = 0;
+
+    std::string call_start_tag_;
+    std::string call_end_tag_;
 
-    std::unordered_set<uint32_t> qwen_tool_call_start_tokens_; 
-    std::unordered_set<uint32_t> qwen_tool_call_end_tokens_;   
+    std::unordered_set<uint32_t> qwen_tool_call_start_tokens_;
+    std::unordered_set<uint32_t> qwen_tool_call_end_tokens_;
     std::unordered_set<uint32_t> open_brace_tokens_;         
     std::unordered_set<uint32_t> close_brace_tokens_;       
     std::unordered_set<uint32_t> colon_tokens_;            
@@ -472,7 +581,7 @@ class ToolCallConstrainer {
     std::unordered_set<uint32_t> quote_tokens_;            
     std::unordered_set<uint32_t> backtick_tokens_;   
     std::unordered_set<uint32_t> all_func_name_tokens_;
-    std::unordered_map<std::string, std::vector<uint32_t>> func_name_sequences_;  
+    std::unordered_map<std::string, std::vector<uint32_t>> func_name_sequences_;
 
     std::unordered_set<uint32_t> tool_start_tokens_;
     std::unordered_set<uint32_t> tool_end_tokens_;
@@ -523,12 +632,16 @@ class Model {
 
     virtual void prefill(const std::vector<uint32_t>& tokens, size_t chunk_size = 256, const std::string& profile_file = "");
 
+    virtual void prefill_with_images(const std::vector<uint32_t>& tokens, const std::vector<std::string>& image_paths,
+                                     const std::string& profile_file = "");
+
     virtual uint32_t decode_with_images(const std::vector<uint32_t>& tokens, const std::vector<std::string>& image_paths,
                                           float temperature = -1.0f, float top_p = -1.0f,
                                           size_t top_k = 0, const std::string& profile_file = "", float* out_entropy = nullptr);
 
     virtual uint32_t decode_with_audio(const std::vector<uint32_t>& tokens, const std::vector<float>& audio_features, float temperature = 0.0f, float top_p = 0.0f,
-                      size_t top_k = 0, const std::string& profile_file = "", float* out_entropy = nullptr);
+                      size_t top_k = 0, const std::string& profile_file = "", float* out_entropy = nullptr,
+                      float* out_token_time_start = nullptr, float* out_token_time_end = nullptr);
 
     std::vector<float> get_embeddings(const std::vector<uint32_t>& tokens, bool pooled = true, bool normalize = false, const std::string& profile_file = "");
     
@@ -548,13 +661,37 @@ class Model {
     bool has_npu_prefill() const;
     size_t get_prefill_chunk_size() const;
 
+    virtual void remove_thinking_tokens(const std::vector<std::pair<size_t, size_t>>& ranges);
+    virtual void compact_kv_cache() {}  // does nothing in the base class; virtual so a subclass can supply behavior
+
     void set_tool_constraints(const std::vector<std::string>& function_names);
     void clear_tool_constraints();
     void update_tool_constraints(uint32_t token_id);
 
     void* graph_handle_;
 
+    void set_vocab_bias(const std::unordered_map<uint32_t, float>& bias) {  // replaces the stored map with a copy of `bias`; keys are presumably token ids — TODO confirm
+        vocab_bias_ = bias;
+    }
+
+    void clear_vocab_bias() {  // empties the stored map, after which has_vocab_bias() returns false
+        vocab_bias_.clear();
+    }
+
+    bool has_vocab_bias() const {  // true when the stored map holds at least one entry
+        return !vocab_bias_.empty();
+    }
+
+    const std::unordered_map<uint32_t, float>& get_vocab_bias() const {  // read-only reference to the stored map, not a copy
+        return vocab_bias_;
+    }
+
 protected:
+    size_t sample_token(CactusGraph* gb, size_t logits_node_id, float temperature, float top_p, size_t top_k,
+                        const std::unordered_map<uint32_t, float>* extra_bias = nullptr) const;
+
+    static void compute_entropy(CactusGraph* gb, size_t logits_node_id, float* out_entropy);
+
     virtual size_t forward(const std::vector<uint32_t>& tokens, bool use_cache = false) = 0;
     
     virtual size_t forward(const std::vector<float>& audio_features, const std::vector<uint32_t>& tokens, bool use_cache = false);
@@ -569,6 +706,12 @@ class Model {
     virtual size_t build_transformer_block(CactusGraph* gb, size_t hidden, uint32_t layer_idx,
                                   ComputeBackend backend, bool use_cache = false, size_t position_offset = 0) = 0;
     void update_kv_cache(CactusGraph* gb, size_t seq_len);
+    virtual std::vector<size_t> get_kv_layer_dims() const {  // base version: config_.num_layers entries, all equal to config_.attention_head_dim
+        return std::vector<size_t>(config_.num_layers, config_.attention_head_dim);
+    }
+    virtual std::vector<size_t> get_kv_layer_heads() const {  // base version: config_.num_layers entries, all equal to config_.attention_kv_heads
+        return std::vector<size_t>(config_.num_layers, config_.attention_kv_heads);
+    }
     virtual void post_init() {}
     virtual void post_execute_updates(CactusGraph*, size_t) {}
     Config config_;
@@ -601,6 +744,9 @@ class Model {
     virtual std::vector<__fp16> get_token_embeddings(const std::vector<uint32_t>& tokens);
 
     ToolCallConstrainer tool_constrainer_;
+
+private:
+    std::unordered_map<uint32_t, float> vocab_bias_;
 };
 
 std::unique_ptr<Model> create_model(const std::string& model_folder);
@@ -705,13 +851,17 @@ class AudioProcessor {
         bool remove_dc_offset = false;
         float preemphasis = 0.0f;
         bool hann_periodic = true;
+        float window_a0 = 0.5f;
+        size_t fft_override = 0;
+        bool mel_floor_additive = false;
     };
 
     AudioProcessor();
     ~AudioProcessor();
 
     void init_mel_filters(size_t num_frequency_bins, size_t num_mel_filters,
-                          float min_freq, float max_freq, size_t sampling_rate);
+                          float min_freq, float max_freq, size_t sampling_rate,
+                          const char* norm = "slaney", const char* mel_scale = "slaney");
 
     std::vector<float> compute_spectrogram(
         const std::vector<float>& waveform,
diff --git a/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/gemma_tools.h b/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/gemma_tools.h
index 912de57..f0f9fe2 100644
--- a/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/gemma_tools.h
+++ b/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/gemma_tools.h
@@ -53,6 +53,7 @@ inline std::string format_argument(const std::string& json, size_t& pos, bool es
     char c = json[pos];
 
     if (c == '"') {
+        pos++;
         std::string value = extract_json_string(json, pos);
         return escape(value);
     } else if (c == '{') {
@@ -240,7 +241,7 @@ inline std::string format_parameters(const std::string& properties_json, const s
                 result += ",properties:{" + format_parameters(prop_obj["properties"], nested_required) + "}";
             }
             if (prop_obj.count("required")) {
-                result += ",required:[";
+                std::string req_items;
                 size_t req_pos = 0;
                 skip_whitespace(prop_obj["required"], req_pos);
                 if (req_pos < prop_obj["required"].length() && prop_obj["required"][req_pos] == '[') {
@@ -253,13 +254,15 @@ inline std::string format_parameters(const std::string& properties_json, const s
                         if (prop_obj["required"][req_pos] == '"') {
                             req_pos++;
                             std::string req_item = extract_json_string(prop_obj["required"], req_pos);
-                            if (!req_first) result += ",";
+                            if (!req_first) req_items += ",";
                             req_first = false;
-                            result += escape(req_item);
+                            req_items += escape(req_item);
                         }
                     }
                 }
-                result += "]";
+                if (!req_items.empty()) {
+                    result += ",required:[" + req_items + "]";
+                }
             }
         } else if (to_upper(type_val) == "ARRAY") {
             if (prop_obj.count("items")) {
@@ -342,7 +345,7 @@ inline std::string format_function_declaration(const std::string& name,
         }
 
         if (params.count("required")) {
-            result += ",required:[";
+            std::string req_items;
             size_t req_pos = 0;
             skip_whitespace(params["required"], req_pos);
             if (req_pos < params["required"].length() && params["required"][req_pos] == '[') {
@@ -355,13 +358,15 @@ inline std::string format_function_declaration(const std::string& name,
                     if (params["required"][req_pos] == '"') {
                         req_pos++;
                         std::string item = extract_json_string(params["required"], req_pos);
-                        if (!first) result += ",";
+                        if (!first) req_items += ",";
                         first = false;
-                        result += escape(item);
+                        req_items += escape(item);
                     }
                 }
             }
-            result += "]";
+            if (!req_items.empty()) {
+                result += ",required:[" + req_items + "]";
+            }
         }
 
         if (params.count("type")) {
@@ -377,12 +382,15 @@ inline std::string format_function_declaration(const std::string& name,
 }
 
 template<typename ToolFunction>
-inline std::string format_tools(const std::vector<ToolFunction>& tools) {
+inline std::string format_tools(const std::vector<ToolFunction>& tools, bool use_pipe_tags = false) {
     if (tools.empty()) return "";
 
+    const char* decl_start = use_pipe_tags ? "<|tool>" : "<start_function_declaration>";
+    const char* decl_end   = use_pipe_tags ? "<tool|>" : "<end_function_declaration>";
+
     std::string result;
     for (const auto& tool : tools) {
-        result += "<start_function_declaration>";
+        result += decl_start;
         std::string params_json;
         auto it = tool.parameters.find("schema");
         if (it != tool.parameters.end()) {
@@ -390,12 +398,26 @@ inline std::string format_tools(const std::vector<ToolFunction>& tools) {
         }
 
         result += format_function_declaration(tool.name, tool.description, params_json);
-        result += "<end_function_declaration>";
+        result += decl_end;
     }
     return result;
 }
 
 
+inline size_t match_quote_tag(const std::string& s, size_t pos) {  // length of the quote tag that starts exactly at s[pos], or 0 if there is none
+    if (s.compare(pos, 8, "<escape>") == 0) return 8;  // 8 == strlen("<escape>"); compare() throws std::out_of_range if pos > s.size()
+    if (s.compare(pos, 5, "<|\"|>") == 0) return 5;  // 5 == length of the pipe-style tag <|"|>
+    return 0;  // neither spelling found at pos
+}
+
+inline size_t find_quote_tag(const std::string& s, size_t pos) {  // index of the first quote tag (either spelling) at or after pos, or npos
+    size_t e = s.find("<escape>", pos);  // next <escape> tag
+    size_t t = s.find("<|\"|>", pos);  // next <|"|> tag
+    if (e == std::string::npos) return t;  // also covers "neither found": t is npos in that case
+    if (t == std::string::npos) return e;
+    return std::min(e, t);  // both spellings present: the earlier occurrence wins
+}
+
 inline std::string unescape(const std::string& s) {
     const std::string ESCAPE_TAG = "<escape>";
     std::string result = s;
@@ -427,12 +449,13 @@ inline std::string args_to_json(const std::string& args_content) {
         while (pos < args_content.length() && std::isspace(args_content[pos])) pos++;
 
         if (pos < args_content.length()) {
-            if (args_content.compare(pos, 8, "<escape>") == 0) {
-                pos += 8; 
-                size_t val_end = args_content.find("<escape>", pos);
+            size_t qtag_len = match_quote_tag(args_content, pos);
+            if (qtag_len > 0) {
+                pos += qtag_len;
+                size_t val_end = find_quote_tag(args_content, pos);
                 if (val_end != std::string::npos) {
                     value = "\"" + args_content.substr(pos, val_end - pos) + "\"";
-                    pos = val_end + 8; 
+                    pos = val_end + match_quote_tag(args_content, val_end);
                 }
             } else if (args_content[pos] == '{') {
                 int depth = 1;
@@ -464,12 +487,13 @@ inline std::string args_to_json(const std::string& args_content) {
                     if (!first_item) value += ",";
                     first_item = false;
 
-                    if (arr_content.compare(arr_pos, 8, "<escape>") == 0) {
-                        arr_pos += 8;
-                        size_t end = arr_content.find("<escape>", arr_pos);
+                    size_t aq_len = match_quote_tag(arr_content, arr_pos);
+                    if (aq_len > 0) {
+                        arr_pos += aq_len;
+                        size_t end = find_quote_tag(arr_content, arr_pos);
                         if (end != std::string::npos) {
                             value += "\"" + arr_content.substr(arr_pos, end - arr_pos) + "\"";
-                            arr_pos = end + 8;
+                            arr_pos = end + match_quote_tag(arr_content, end);
                         }
                     } else {
                         size_t end = arr_content.find_first_of(",]", arr_pos);
@@ -499,8 +523,11 @@ inline std::string args_to_json(const std::string& args_content) {
 }
 
 inline void parse_function_calls(std::string& response, std::vector<std::string>& function_calls) {
-    const std::string CALL_START = "<start_function_call>";
-    const std::string CALL_END = "<end_function_call>";
+
+    const std::string CALL_START = (response.find("<|tool_call>") != std::string::npos)
+        ? "<|tool_call>" : "<start_function_call>";
+    const std::string CALL_END = (CALL_START == "<|tool_call>")
+        ? "<tool_call|>" : "<end_function_call>";
     size_t pos = 0;
 
     while ((pos = response.find(CALL_START, pos)) != std::string::npos) {
diff --git a/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/graph.h b/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/graph.h
index 01b7b2f..020412a 100644
--- a/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/graph.h
+++ b/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/graph.h
@@ -122,13 +122,14 @@ enum class Activation {
 enum class OpType {
     INPUT, PRECISION_CAST,
     ADD, ADD_CLIPPED, SUBTRACT, MULTIPLY, DIVIDE,
+    ABS, POW, FLATTEN, VIEW,
     MATMUL, TRANSPOSE, RESHAPE, SLICE, GATHER, EMBEDDING,
     BILINEAR_INTERPOLATION,
     SUM, MEAN, VARIANCE, MIN, MAX,
     RMS_NORM, ROPE, ROPE_GPTJ, SOFTMAX, ATTENTION, ATTENTION_INT8_HYBRID, REL_POS_BIAS, CONV1D_CAUSAL, CONV1D_K3, CONV1D_K7S3, CONV1D, CONV1D_SAME_DEPTHWISE_K9, CONV1D_POINTWISE, CONV2D_K3S2P1, CONV2D_DEPTHWISE_K3S2P1, CONV2D_POINTWISE_1X1, GLU, BATCHNORM,
     SCALAR_ADD, SCALAR_SUBTRACT, SCALAR_MULTIPLY, SCALAR_DIVIDE, SCALAR_EXP, SCALAR_SQRT, SCALAR_COS, SCALAR_SIN, SCALAR_LOG,
     RELU, SILU, GELU, GELU_ERF, SIGMOID, TANH,
-    SAMPLE, CONCAT,
+    SAMPLE, CONCAT, CAT,
     SCATTER_TOPK,
     TOPK, LAYERNORM, GROUPNORM,
     MOE_LAYER,
@@ -136,7 +137,17 @@ enum class OpType {
     PERSISTENT,
     QUANTIZE_ACTIVATIONS,
     LSTM_CELL,
-    STFT
+    GATED_DELTANET_DECODE,
+    GATED_DELTANET_PREFILL,
+    STFT,
+    ALTUP_PREDICT,
+    ALTUP_CORRECT,
+    GAUSSIAN_TOPK,
+    MAXPOOL1D,
+    BILSTM_SEQUENCE,
+    LEAKY_RELU,
+    CONV2D_K3S1P1,
+    STATS_POOL
 };
 
 struct PrecisionTraits {
@@ -315,6 +326,7 @@ struct OpParams {
     size_t window_size = 0;
     bool is_causal = true;  
     bool attention_mask_is_additive = false;
+    float logit_cap = 0.0f;
     std::vector<size_t> new_shape;
     std::vector<size_t> permutation;
     Precision output_precision = Precision::INT8;
@@ -350,6 +362,10 @@ struct OpParams {
     size_t num_kv_heads = 0;
     size_t head_dim = 0;
     size_t num_fft_bins = 0;
+    size_t chunk_size = 0;
+    size_t num_altup_inputs = 0;
+    size_t v_head_dim = 0;
+    size_t kernel_size = 0;
 };
 
 struct GraphNode {
@@ -362,6 +378,28 @@ struct GraphNode {
     GraphNode(size_t node_id, OpType type);
 };
 
+using nodes_vector = std::vector<std::unique_ptr<GraphNode>>;
+using node_index_map_t = std::unordered_map<size_t, size_t>;
+
+inline const BufferDesc& get_input(const GraphNode& node, size_t idx,  // output buffer of the node referenced by node.input_ids[idx]
+                                   const nodes_vector& nodes,
+                                   const node_index_map_t& node_index_map) {
+    return nodes[node_index_map.at(node.input_ids[idx])]->output_buffer;  // .at() throws std::out_of_range for an unmapped id; idx is used without a range check
+}
+
+struct AxisDims {  // a shape split around one axis into (outer, axis_size, inner) extents
+    size_t outer, axis_size, inner;
+    static AxisDims from_shape(const std::vector<size_t>& shape, size_t axis) {  // requires axis < shape.size(); this is not checked
+        AxisDims d;
+        d.outer = 1;
+        for (size_t i = 0; i < axis; i++) d.outer *= shape[i];  // product of the dims before axis (1 when axis == 0)
+        d.axis_size = shape[axis];
+        d.inner = 1;
+        for (size_t i = axis + 1; i < shape.size(); i++) d.inner *= shape[i];  // product of the dims after axis (1 when axis is last)
+        return d;
+    }
+};
+
 template<typename T>
 void dispatch_binary_op(OpType op, const T* lhs, const T* rhs, T* output, size_t count);
 
@@ -383,6 +421,14 @@ void compute_groupnorm_node(GraphNode& node, const std::vector<std::unique_ptr<G
 void compute_persistent_node(GraphNode& node, const std::vector<std::unique_ptr<GraphNode>>& nodes, const std::unordered_map<size_t, size_t>& node_index_map);
 void compute_index_node(GraphNode& node, const std::vector<std::unique_ptr<GraphNode>>& nodes, const std::unordered_map<size_t, size_t>& node_index_map);
 void compute_lstm_cell_node(GraphNode& node, const std::vector<std::unique_ptr<GraphNode>>& nodes, const std::unordered_map<size_t, size_t>& node_index_map);
+void compute_gated_deltanet_decode_node(GraphNode& node, const std::vector<std::unique_ptr<GraphNode>>& nodes, const std::unordered_map<size_t, size_t>& node_index_map);
+void compute_gated_deltanet_prefill_node(GraphNode& node, const std::vector<std::unique_ptr<GraphNode>>& nodes, const std::unordered_map<size_t, size_t>& node_index_map);
+void compute_altup_predict_node(GraphNode& node, const std::vector<std::unique_ptr<GraphNode>>& nodes, const std::unordered_map<size_t, size_t>& node_index_map);
+void compute_altup_correct_node(GraphNode& node, const std::vector<std::unique_ptr<GraphNode>>& nodes, const std::unordered_map<size_t, size_t>& node_index_map);
+void compute_maxpool1d_node(GraphNode& node, const std::vector<std::unique_ptr<GraphNode>>& nodes, const std::unordered_map<size_t, size_t>& node_index_map);
+void compute_bilstm_sequence_node(GraphNode& node, const std::vector<std::unique_ptr<GraphNode>>& nodes, const std::unordered_map<size_t, size_t>& node_index_map);
+void compute_conv2d_k3s1p1_node(GraphNode& node, const std::vector<std::unique_ptr<GraphNode>>& nodes, const std::unordered_map<size_t, size_t>& node_index_map);
+void compute_stats_pool_node(GraphNode& node, const std::vector<std::unique_ptr<GraphNode>>& nodes, const std::unordered_map<size_t, size_t>& node_index_map);
 
 void shrink_thread_local_buffers();
 class BufferPool {
@@ -437,7 +483,6 @@ class CactusGraph {
     size_t multiply(size_t input1, size_t input2);
     size_t divide(size_t input1, size_t input2);
     
-    
     size_t scalar_add(size_t input, float value);
     size_t scalar_subtract(size_t input, float value);
     size_t scalar_multiply(size_t input, float value);
@@ -455,6 +500,11 @@ class CactusGraph {
     size_t sigmoid(size_t input);
     size_t tanh(size_t input);
     size_t glu(size_t input, int axis = -1);
+
+    size_t abs(size_t input);
+    size_t pow(size_t input, float exponent);
+    size_t view(size_t input, const std::vector<size_t>& new_shape);
+    size_t flatten(size_t input, int start_dim = 0, int end_dim = -1);
     
     size_t matmul(size_t input1, size_t input2, bool pretransposed_rhs = false, ComputeBackend backend = ComputeBackend::CPU);
     size_t transpose(size_t input, ComputeBackend backend = ComputeBackend::CPU);
@@ -497,7 +547,9 @@ class CactusGraph {
                      size_t num_experts_per_tok,
                      bool normalize_routing,
                      float epsilon,
-                     float routed_scaling_factor);
+                     float routed_scaling_factor,
+                     Activation activation = Activation::SILU,
+                     size_t per_expert_scale = 0);
     size_t moe_layer(size_t hidden,
                      size_t routing_probs,
                      size_t topk_indices,
@@ -518,13 +570,15 @@ class CactusGraph {
     size_t attention(size_t query, size_t key, size_t value, float scale, size_t position_offset, size_t window_size, ComputeBackend backend = ComputeBackend::CPU);
     size_t attention_masked(size_t query, size_t key, size_t value, size_t mask, float scale,
                             bool is_causal = true, ComputeBackend backend = ComputeBackend::CPU,
-                            bool additive_mask = false, size_t position_offset = 0, size_t window_size = 0);
+                            bool additive_mask = false, size_t position_offset = 0, size_t window_size = 0,
+                            float logit_cap = 0.0f);
     size_t rel_pos_bias(size_t query, size_t relative_key, float scale);
 
     size_t attention_int8_hybrid(size_t query, size_t key_new, size_t value_new, float scale, size_t position_offset,
                                  const int8_t* cached_keys, const int8_t* cached_values,
                                  const float* k_scales, const float* v_scales,
-                                 size_t cache_len, size_t num_kv_heads, size_t head_dim, size_t window_size = 0);
+                                 size_t cache_len, size_t num_kv_heads, size_t head_dim,
+                                 size_t window_size = 0, size_t v_head_dim = 0);
 
     size_t conv1d_causal(size_t input, size_t weight, size_t kernel_size, size_t dilation = 1);
     size_t conv1d_k3(size_t input, size_t weight, size_t stride);
@@ -543,12 +597,30 @@ class CactusGraph {
     size_t conv2d_pointwise_1x1(size_t input, size_t weight, size_t bias);
 
     size_t lstm_cell(size_t input, size_t h_prev, size_t c_prev, size_t weight_ih, size_t weight_hh, size_t bias_ih, size_t bias_hh);
+    size_t gated_deltanet_decode(size_t query, size_t key, size_t value, size_t gate_log, size_t beta,
+                                 size_t initial_state, float scale = 0.0f);
+    size_t gated_deltanet_prefill(size_t query, size_t key, size_t value, size_t gate_log, size_t beta,
+                                  size_t initial_state, size_t chunk_size = 64, float scale = 0.0f);
     size_t stft(size_t input, size_t weight, size_t stride, size_t num_fft_bins);
 
+    size_t altup_predict(size_t coefs, const size_t* streams, size_t num_streams);
+    size_t altup_correct(size_t coefs, size_t innovation, const size_t* predictions, size_t num_predictions);
+
+    size_t gaussian_topk(size_t input, float ppf);
+
+    size_t maxpool1d(size_t input, size_t kernel_size, size_t stride);
+    size_t leaky_relu(size_t input, float negative_slope = 0.01f);
+    size_t bilstm_sequence(size_t input, size_t w_ih_fwd, size_t w_hh_fwd, size_t b_ih_fwd, size_t b_hh_fwd,
+                           size_t w_ih_bwd, size_t w_hh_bwd, size_t b_ih_bwd, size_t b_hh_bwd);
+    size_t conv2d_k3s1p1(size_t input, size_t weight);
+    size_t conv2d_k3s1p1(size_t input, size_t weight, size_t bias);
+    size_t stats_pool(size_t input);
+
     size_t sample(size_t logits, float temperature = 0.6f, float top_p = 0.95f, size_t top_k = 20,
                   const std::unordered_map<uint32_t, float>& logit_bias = {});
     
     size_t concat(size_t input1, size_t input2, int axis = 0);
+    size_t cat(const std::vector<size_t>& inputs, int axis);
     size_t scatter_topk(size_t indices, size_t values, size_t num_classes);
     
     void set_input(size_t node_id, const void* data, Precision precision);
@@ -653,4 +725,4 @@ namespace GraphFile {
     };
 }
 
-#endif
\ No newline at end of file
+#endif
diff --git a/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/kernel.h b/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/kernel.h
index 0ec7265..77f950a 100644
--- a/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/kernel.h
+++ b/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/kernel.h
@@ -11,7 +11,9 @@ enum class ScalarOpType {
     SUBTRACT,
     MULTIPLY,
     DIVIDE,
+    ABS,
     EXP,
+    POW,
     SQRT,
     COS,
     SIN,
@@ -54,6 +56,14 @@ void cactus_matmul_int8(const int8_t* A, const float* A_scales,
                         const int8_t* B, const __fp16* B_scales,
                         __fp16* C, size_t M, size_t K, size_t N, size_t group_size);
 
+void cactus_gemv_int8_i8mm(const int8_t* A, float A_scale,
+                            const int8_t* B, const __fp16* B_scales,
+                            __fp16* C, size_t K, size_t N, size_t group_size);
+
+void cactus_gemm_int8_i8mm(const int8_t* A, const float* A_scales,
+                            const int8_t* B, const __fp16* B_scales,
+                            __fp16* C, size_t M, size_t K, size_t N, size_t group_size);
+
 void cactus_gemv_int4(const int8_t* A, float A_scale,
                       const int8_t* B_packed, const __fp16* B_scales,
                       __fp16* C, size_t K, size_t N, size_t group_size);
@@ -97,6 +107,9 @@ void cactus_max_axis_f16(const __fp16* input, __fp16* output, size_t outer_size,
 void cactus_rms_norm_f16(const __fp16* input, const __fp16* weight, __fp16* output,
                           size_t batch_size, size_t dims, float eps);
 
+void cactus_layer_norm_f16(const __fp16* input, const __fp16* weight, const __fp16* bias,
+                            __fp16* output, size_t batch_size, size_t dims, float eps);
+
 void cactus_rope_f16(const __fp16* input, __fp16* output, size_t batch_size, size_t seq_len,
                       size_t num_heads, size_t head_dim, size_t start_pos, float theta);
 
@@ -108,6 +121,8 @@ void cactus_softmax_f16(const __fp16* input, __fp16* output, size_t batch_size,
 
 void cactus_relu_f16(const __fp16* input, __fp16* output, size_t num_elements);
 
+void cactus_leaky_relu_f16(const __fp16* input, __fp16* output, size_t num_elements, float negative_slope);
+
 void cactus_silu_f16(const __fp16* input, __fp16* output, size_t num_elements);
 
 void cactus_gelu_f16(const __fp16* input, __fp16* output, size_t num_elements);
@@ -163,21 +178,54 @@ void cactus_batchnorm_f32(
 void cactus_attention_f16(const __fp16* queries, const __fp16* keys, const __fp16* values, __fp16* output,
                           size_t batch_size, size_t seq_len, size_t kv_seq_len, size_t num_q_heads, size_t num_kv_heads,
                           size_t head_dim, float scale, const __fp16* mask, size_t position_offset = 0, size_t window_size = 0,
-                          bool is_causal = true, bool mask_is_additive = false, bool mask_per_head = false);
+                          bool is_causal = true, bool mask_is_additive = false, bool mask_per_head = false,
+                          size_t v_head_dim = 0, float logit_cap = 0.0f);
 
 void cactus_attention_hybrid_int8_fp16(
-    const __fp16* queries,        
-    const int8_t* keys_cached, 
-    const int8_t* values_cached, 
+    const __fp16* queries,
+    const int8_t* keys_cached,
+    const int8_t* values_cached,
     const float* k_scales,
-    const float* v_scales, 
-    const __fp16* keys_new, 
-    const __fp16* values_new, 
+    const float* v_scales,
+    const __fp16* keys_new,
+    const __fp16* values_new,
     __fp16* output,
     size_t batch_size, size_t seq_len, size_t cache_len, size_t new_len,
     size_t num_q_heads, size_t num_kv_heads, size_t head_dim,
     float scale, size_t position_offset = 0, bool is_causal = true, size_t window_size = 0,
-    size_t group_size = KV_QUANT_GROUP_SIZE);
+    size_t group_size = KV_QUANT_GROUP_SIZE, size_t v_head_dim = 0);
+
+void cactus_gated_deltanet_decode_f16(
+    const __fp16* q_data,
+    const __fp16* k_data,
+    const __fp16* v_data,
+    const __fp16* g_data,
+    const __fp16* b_data,
+    const __fp16* s_data,
+    __fp16* out,
+    size_t B,
+    size_t Hq,
+    size_t Hv,
+    size_t K,
+    size_t V,
+    float scale);
+
+void cactus_gated_deltanet_prefill_f16(
+    const __fp16* q_data,
+    const __fp16* k_data,
+    const __fp16* v_data,
+    const __fp16* g_data,
+    const __fp16* b_data,
+    const __fp16* s_data,
+    __fp16* out,
+    size_t B,
+    size_t T,
+    size_t Hq,
+    size_t Hv,
+    size_t K,
+    size_t V,
+    size_t requested_chunk_size,
+    float scale);
 
 void cactus_conv1d_causal_depthwise_f16(
     const __fp16* input,
@@ -244,6 +292,18 @@ void cactus_conv1d_same_depthwise_f16_k9(
     size_t C
 );
 
+void cactus_conv2d_f16_k3s1p1_nchw(
+    const __fp16* input,
+    const __fp16* weight,
+    const __fp16* bias,
+    __fp16* output,
+    size_t N,
+    size_t C_in,
+    size_t H,
+    size_t W,
+    size_t C_out
+);
+
 void cactus_conv2d_f16_k3s2p1_nchw(
     const __fp16* input,
     const __fp16* weight,
@@ -305,6 +365,8 @@ void cactus_sample_f16(const __fp16* logits, uint32_t* output, size_t vocab_size
 void cactus_concat_f16(const __fp16* input1, const __fp16* input2, __fp16* output,
                        const size_t* shape1, const size_t* shape2, const size_t* output_shape,
                        size_t ndims, int axis);
+void cactus_cat_f16(const __fp16** inputs, __fp16* output, const size_t** input_shapes,
+                      const size_t* output_shape, size_t num_inputs, size_t rank, int axis);
 
 void cactus_int8_to_fp32(const int8_t* src, float* dst, size_t count, float scale = 1.0f);
 void cactus_fp32_to_int8(const float* src, int8_t* dst, size_t count, float scale = 1.0f);
@@ -328,6 +390,30 @@ inline size_t kv_scales_count(size_t seq_len, size_t kv_heads, size_t head_dim,
 
 void cactus_unpack_int4_to_int8(const uint8_t* packed, int8_t* unpacked, size_t unpacked_count);
 
+void cactus_gaussian_topk_f16(
+    const __fp16* input,
+    __fp16* output,
+    size_t rows,
+    size_t cols,
+    float ppf);
+
+void cactus_altup_predict_f16(
+    const __fp16* coefs,
+    const __fp16* const* streams,
+    __fp16* output,
+    size_t n,
+    size_t seq_len,
+    size_t hidden_dim);
+
+void cactus_altup_correct_f16(
+    const __fp16* coefs,
+    const __fp16* innovation,
+    const __fp16* const* predictions,
+    __fp16* output,
+    size_t n,
+    size_t seq_len,
+    size_t hidden_dim);
+
 void cactus_lstm_cell_f16(
     const __fp16* x_input,
     const __fp16* h_prev,
@@ -343,4 +429,31 @@ void cactus_lstm_cell_f16(
     size_t hidden_size
 );
 
-#endif
\ No newline at end of file
+void cactus_bilstm_sequence_f16(
+    const __fp16* input,
+    const __fp16* weight_ih_fwd,
+    const __fp16* weight_hh_fwd,
+    const __fp16* bias_ih_fwd,
+    const __fp16* bias_hh_fwd,
+    const __fp16* weight_ih_bwd,
+    const __fp16* weight_hh_bwd,
+    const __fp16* bias_ih_bwd,
+    const __fp16* bias_hh_bwd,
+    __fp16* output,
+    size_t batch_size,
+    size_t seq_len,
+    size_t input_size,
+    size_t hidden_size
+);
+
+void cactus_maxpool1d_f16(
+    const __fp16* input,
+    __fp16* output,
+    size_t batch_size,
+    size_t channels,
+    size_t input_length,
+    size_t kernel_size,
+    size_t stride
+);
+
+#endif
diff --git a/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/kernel_utils.h b/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/kernel_utils.h
index 118c85c..2d7a7a1 100644
--- a/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/kernel_utils.h
+++ b/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/kernel_utils.h
@@ -9,6 +9,8 @@
 #if defined(__ANDROID__)
 #include <sys/auxv.h>
 #include <asm/hwcap.h>
+#include <sched.h>
+#include <fstream>
 #endif
 #include <algorithm>
 #include <cmath>
@@ -44,6 +46,29 @@ inline void stream_store_f16x8(__fp16* dst, float16x8_t val) {
 #endif
 }
 
+inline bool cpu_has_i8mm() {  // reports whether the CPU is treated as supporting I8MM; the answer is computed once and cached
+#if defined(__aarch64__)
+    static std::once_flag once;
+    static bool has = false;
+
+    std::call_once(once, []() {
+#if defined(__APPLE__)
+    has = true;  // NOTE(review): no runtime probe here — assumes every arm64 Apple target has I8MM; confirm for older chips and simulator hosts
+#elif defined(__ANDROID__)
+    unsigned long hwcap2 = getauxval(AT_HWCAP2);
+    #ifndef HWCAP2_I8MM
+    #define HWCAP2_I8MM (1 << 13)  // fallback when the platform headers do not define it
+    #endif
+    has = (hwcap2 & HWCAP2_I8MM) != 0;
+#endif  // any other aarch64 platform leaves has == false
+    });
+
+    return has;
+#else
+    return false;  // non-aarch64 builds never report I8MM
+#endif
+}
+
 inline bool cpu_has_sme2() {
 #if defined(__aarch64__)
 	static std::once_flag once;
@@ -130,6 +155,33 @@ inline float32x4_t fast_tanh_f32x4(float32x4_t x) {
     return result;
 }
 
+constexpr size_t SIMD_F16_WIDTH = 8;
+
+inline size_t simd_align(size_t count, size_t width = SIMD_F16_WIDTH) {  // rounds count down to a multiple of width
+    return (count / width) * width;  // integer division truncates; width must be non-zero
+}
+
+inline void f16x8_split_f32(float16x8_t v, float32x4_t& lo, float32x4_t& hi) {  // widens 8 fp16 lanes into two 4-lane fp32 vectors
+    lo = vcvt_f32_f16(vget_low_f16(v));  // lanes 0-3
+    hi = vcvt_f32_f16(vget_high_f16(v));  // lanes 4-7
+}
+
+inline float16x8_t f32_merge_f16(float32x4_t lo, float32x4_t hi) {  // narrows two fp32 vectors to fp16 and joins them into one 8-lane vector
+    return vcombine_f16(vcvt_f16_f32(lo), vcvt_f16_f32(hi));  // lo fills lanes 0-3, hi fills lanes 4-7
+}
+
+inline float32x4_t fast_sigmoid_f32x4(float32x4_t x) {  // per-lane 1 / (1 + exp(-x)), with exp supplied by fast_exp_f32x4
+    const float32x4_t one = vdupq_n_f32(1.0f);
+    return vdivq_f32(one, vaddq_f32(one, fast_exp_f32x4(vnegq_f32(x))));
+}
+
+template<typename F32x4Op>
+inline float16x8_t apply_f32_op_on_f16x8(float16x8_t v, F32x4Op op) {  // runs a 4-lane fp32 op over all 8 fp16 lanes of v
+    float32x4_t lo, hi;
+    f16x8_split_f32(v, lo, hi);  // widen to fp32 first
+    return f32_merge_f16(op(lo), op(hi));  // op is invoked once per half, then the results are narrowed back to fp16
+}
+
 inline void unpack_int4_as_int8x16x2(const uint8_t* ptr, int8x16_t& high_decoded, int8x16_t& low_decoded) {
     int8x16_t packed = vreinterpretq_s8_u8(vld1q_u8(ptr));
     high_decoded = vshrq_n_s8(packed, 4);
@@ -138,6 +190,80 @@ inline void unpack_int4_as_int8x16x2(const uint8_t* ptr, int8x16_t& high_decoded
 
 namespace CactusThreading {
 
+#if defined(__ANDROID__)
+    struct CoreTopology {  // CPU ids discovered through sysfs, plus the subset treated as performance cores
+        std::vector<int> performance_cores;  // ids whose score is at least 70% of the highest score seen
+        std::vector<int> all_cores;  // every id for which a positive score could be read
+
+        static CoreTopology& get() {  // shared instance; detect() runs once, on the first call
+            static CoreTopology topo = detect();
+            return topo;
+        }
+
+    private:
+        static int read_sysfs_int(const char* path) {  // first integer in the file; -1 if the file cannot be opened
+            std::ifstream f(path);
+            if (!f.is_open()) return -1;
+            int val = -1;
+            f >> val;  // empty or non-numeric content yields a non-positive value, which detect() treats as missing
+            return val;
+        }
+
+        static CoreTopology detect() {  // probes cpu0..cpu15 and classifies each readable core
+            CoreTopology topo;
+            constexpr int MAX_CPUS = 16;  // ids at or above this are never probed
+            std::vector<std::pair<int, int>> core_caps; // (cpu id, score); score is cpu_capacity or, failing that, cpuinfo_max_freq
+
+            for (int i = 0; i < MAX_CPUS; ++i) {
+                char path[128];
+
+                snprintf(path, sizeof(path),
+                         "/sys/devices/system/cpu/cpu%d/cpu_capacity", i);
+                int cap = read_sysfs_int(path);
+                if (cap > 0) {
+                    core_caps.push_back({i, cap});
+                    topo.all_cores.push_back(i);
+                    continue;  // capacity found, so the frequency fallback is skipped
+                }
+
+                snprintf(path, sizeof(path),
+                         "/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq", i);
+                int freq = read_sysfs_int(path);
+                if (freq > 0) {
+                    core_caps.push_back({i, freq});  // NOTE(review): shares one list with capacity values; comparable only if every core used the same source — confirm
+                    topo.all_cores.push_back(i);
+                }
+            }
+
+            if (core_caps.empty()) return topo;  // nothing readable: both lists stay empty
+
+            int max_cap = 0;
+            for (auto& [id, cap] : core_caps) {
+                max_cap = std::max(max_cap, cap);
+            }
+
+            int threshold = static_cast<int>(max_cap * 0.70);  // cut-off: 70% of the best score, truncated to int
+            for (auto& [id, cap] : core_caps) {
+                if (cap >= threshold) {
+                    topo.performance_cores.push_back(id);
+                }
+            }
+
+            return topo;
+        }
+    };
+
+    inline bool pin_current_thread_to_cores(const std::vector<int>& cores) {  // restricts the calling thread to the given CPU ids; false if cores is empty or the call fails
+        if (cores.empty()) return false;
+        cpu_set_t mask;
+        CPU_ZERO(&mask);
+        for (int core : cores) {
+            CPU_SET(core, &mask);
+        }
+        return sched_setaffinity(0, sizeof(mask), &mask) == 0;  // pid 0 means the calling thread
+    }
+#endif
+
     class ThreadPool {
     private:
         static constexpr size_t MAX_WORKERS = 16;
@@ -184,9 +310,25 @@ namespace CactusThreading {
             : stop(false), pending_tasks(0) {
             num_workers_ = std::min(num_threads, MAX_WORKERS);
             if (num_workers_ == 0) num_workers_ = 1;
+
+#if defined(__ANDROID__)
+            auto& topo = CoreTopology::get();
+            if (!topo.performance_cores.empty()) {
+                num_workers_ = std::min(num_workers_, topo.performance_cores.size());
+            }
+#endif
+
             workers.reserve(num_workers_);
             for (size_t i = 0; i < num_workers_; ++i) {
-                workers.emplace_back(&ThreadPool::worker_thread, this);
+                workers.emplace_back([this]() {
+#if defined(__ANDROID__)
+                    auto& perf = CoreTopology::get().performance_cores;
+                    if (!perf.empty()) {
+                        pin_current_thread_to_cores(perf);
+                    }
+#endif
+                    worker_thread();
+                });
             }
         }
 
@@ -498,5 +640,52 @@ namespace CactusThreading {
 
 }
 
+template<typename SimdOp, typename ScalarOp>
+void elementwise_op_f16(const __fp16* input, __fp16* output, size_t num_elements,  // applies simd_op to 8-lane fp16 chunks and scalar_op to the leftover tail
+                        bool use_streaming, CactusThreading::ParallelConfig config,
+                        SimdOp simd_op, ScalarOp scalar_op, size_t unroll = 4) {
+    CactusThreading::parallel_for(num_elements, config,
+        [&](size_t start, size_t end) {  // handles the sub-range [start, end)
+            const size_t n = end - start;
+            const size_t vec_end = start + simd_align(n);  // end of the part of this range that fills whole 8-lane vectors
+
+            if (use_streaming && unroll >= 4) {  // 4 vectors (32 elements) per iteration, stored via stream_store_f16x8
+                const size_t unrolled_end = start + simd_align(n, SIMD_F16_WIDTH * 4);
+                for (size_t i = start; i < unrolled_end; i += SIMD_F16_WIDTH * 4) {
+                    __builtin_prefetch(&input[i + 256], 0, 0);  // read prefetch 256 elements ahead, lowest temporal locality
+                    float16x8_t v0 = simd_op(vld1q_f16(&input[i]));
+                    float16x8_t v1 = simd_op(vld1q_f16(&input[i + 8]));
+                    float16x8_t v2 = simd_op(vld1q_f16(&input[i + 16]));
+                    float16x8_t v3 = simd_op(vld1q_f16(&input[i + 24]));
+                    stream_store_f16x8(&output[i], v0);
+                    stream_store_f16x8(&output[i + 8], v1);
+                    stream_store_f16x8(&output[i + 16], v2);
+                    stream_store_f16x8(&output[i + 24], v3);
+                }
+                for (size_t i = unrolled_end; i < vec_end; i += SIMD_F16_WIDTH) {  // whole vectors left over after the unrolled loop
+                    stream_store_f16x8(&output[i], simd_op(vld1q_f16(&input[i])));
+                }
+            } else if (use_streaming && unroll >= 2) {  // 2 vectors (16 elements) per iteration, stored via stream_store_f16x8
+                const size_t unrolled_end = start + simd_align(n, SIMD_F16_WIDTH * 2);
+                for (size_t i = start; i < unrolled_end; i += SIMD_F16_WIDTH * 2) {
+                    __builtin_prefetch(&input[i + 128], 0, 0);  // read prefetch 128 elements ahead, lowest temporal locality
+                    float16x8_t v0 = simd_op(vld1q_f16(&input[i]));
+                    float16x8_t v1 = simd_op(vld1q_f16(&input[i + 8]));
+                    stream_store_f16x8(&output[i], v0);
+                    stream_store_f16x8(&output[i + 8], v1);
+                }
+                for (size_t i = unrolled_end; i < vec_end; i += SIMD_F16_WIDTH) {  // at most one whole vector remains here
+                    stream_store_f16x8(&output[i], simd_op(vld1q_f16(&input[i])));
+                }
+            } else {  // regular vst1q stores; also taken when use_streaming is set but unroll < 2
+                for (size_t i = start; i < vec_end; i += SIMD_F16_WIDTH) {
+                    vst1q_f16(&output[i], simd_op(vld1q_f16(&input[i])));
+                }
+            }
+            for (size_t i = vec_end; i < end; ++i) {  // scalar tail: fewer than 8 elements
+                output[i] = scalar_op(input[i]);
+            }
+        });
+}
 
-#endif // KERNEL_UTILS_H 
+#endif // KERNEL_UTILS_H
diff --git a/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/cactus b/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/cactus
index b1ee86e..59bded3 100755
Binary files a/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/cactus and b/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/cactus differ
diff --git a/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/cactus_ffi.h b/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/cactus_ffi.h
index aa72986..6e35847 100644
--- a/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/cactus_ffi.h
+++ b/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/cactus_ffi.h
@@ -41,7 +41,20 @@ CACTUS_FFI_EXPORT int cactus_complete(
     const char* options_json,               // optional
     const char* tools_json,                 // optional
     cactus_token_callback callback,         // optional
-    void* user_data                         // optional
+    void* user_data,                        // optional
+    const uint8_t* pcm_buffer,             // optional: NULL when not used
+    size_t pcm_buffer_size                 // optional: 0 when not used
+);
+
+CACTUS_FFI_EXPORT int cactus_prefill(
+    cactus_model_t model,
+    const char* messages_json,
+    char* response_buffer,
+    size_t buffer_size,
+    const char* options_json,               // optional
+    const char* tools_json,                 // optional
+    const uint8_t* pcm_buffer,             // optional: NULL when not used
+    size_t pcm_buffer_size                 // optional: 0 when not used
 );
 
 CACTUS_FFI_EXPORT int cactus_tokenize(
@@ -140,6 +153,26 @@ CACTUS_FFI_EXPORT int cactus_vad(
     size_t pcm_buffer_size
 );
 
+CACTUS_FFI_EXPORT int cactus_diarize(
+    cactus_model_t model,
+    const char* audio_file_path,
+    char* response_buffer,
+    size_t buffer_size,
+    const char* options_json,
+    const uint8_t* pcm_buffer,
+    size_t pcm_buffer_size
+);
+
+CACTUS_FFI_EXPORT int cactus_embed_speaker(
+    cactus_model_t model,
+    const char* audio_file_path,
+    char* response_buffer,
+    size_t buffer_size,
+    const char* options_json,
+    const uint8_t* pcm_buffer,
+    size_t pcm_buffer_size
+);
+
 CACTUS_FFI_EXPORT int cactus_rag_query(
     cactus_model_t model,
     const char* query,
@@ -148,7 +181,6 @@ CACTUS_FFI_EXPORT int cactus_rag_query(
     size_t top_k
 );
 
-
 CACTUS_FFI_EXPORT cactus_index_t cactus_index_init(
     const char* index_dir,
     size_t embedding_dim
@@ -199,11 +231,217 @@ CACTUS_FFI_EXPORT void cactus_index_destroy(cactus_index_t index);
 
 CACTUS_FFI_EXPORT const char* cactus_get_last_error(void);
 
+// level: 0=DEBUG, 1=INFO, 2=WARN (default), 3=ERROR, 4=NONE
+CACTUS_FFI_EXPORT void cactus_log_set_level(int level);
+
+typedef void (*cactus_log_callback_t)(int level, const char* component, const char* message, void* user_data);
+CACTUS_FFI_EXPORT void cactus_log_set_callback(cactus_log_callback_t callback, void* user_data);
+
 CACTUS_FFI_EXPORT void cactus_set_telemetry_environment(const char* framework, const char* cache_location, const char* version);
 CACTUS_FFI_EXPORT void cactus_set_app_id(const char* app_id);
 CACTUS_FFI_EXPORT void cactus_telemetry_flush(void);
 CACTUS_FFI_EXPORT void cactus_telemetry_shutdown(void);
 
+// cactus graph export
+typedef void* cactus_graph_t;
+typedef uint64_t cactus_node_t;
+
+typedef struct {  // Out-parameter type of cactus_graph_get_output_info (its out_info argument).
+    int32_t precision;    // precision code, int32_t like the precision arguments of the graph builders
+    size_t rank;          // NOTE(review): presumably the count of meaningful shape entries — confirm
+    size_t shape[8]; // fixed-size array: at most 8 dimensions can be reported
+    size_t num_elements;  // NOTE(review): presumably the product of the used shape entries — confirm
+    size_t byte_size;     // NOTE(review): presumably the size of the tensor data in bytes — confirm
+} cactus_tensor_info_t;
+
+CACTUS_FFI_EXPORT cactus_graph_t cactus_graph_create(void);
+CACTUS_FFI_EXPORT void cactus_graph_destroy(cactus_graph_t graph);
+CACTUS_FFI_EXPORT int cactus_graph_hard_reset(cactus_graph_t graph);
+
+CACTUS_FFI_EXPORT int cactus_graph_input(
+    cactus_graph_t graph, const size_t* shape, size_t rank, int32_t precision,
+cactus_node_t* out_node);
+
+CACTUS_FFI_EXPORT int cactus_graph_set_input(
+    cactus_graph_t graph, cactus_node_t node, const void* data, int32_t
+precision);
+CACTUS_FFI_EXPORT int cactus_graph_set_external_input(
+    cactus_graph_t graph, cactus_node_t node, void* data, int32_t precision);
+
+CACTUS_FFI_EXPORT int cactus_graph_precision_cast(
+    cactus_graph_t graph, cactus_node_t input, int32_t target_precision, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_quantize_activations(
+    cactus_graph_t graph, cactus_node_t input, cactus_node_t* out);
+
+CACTUS_FFI_EXPORT int cactus_graph_add(cactus_graph_t graph, cactus_node_t a,
+cactus_node_t b, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_add_clipped(cactus_graph_t graph, cactus_node_t a,
+cactus_node_t b, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_subtract(cactus_graph_t graph, cactus_node_t
+a, cactus_node_t b, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_multiply(cactus_graph_t graph, cactus_node_t
+a, cactus_node_t b, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_divide(cactus_graph_t graph, cactus_node_t
+a, cactus_node_t b, cactus_node_t* out);
+
+CACTUS_FFI_EXPORT int cactus_graph_scalar_add(cactus_graph_t graph, cactus_node_t x, float value, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_scalar_subtract(cactus_graph_t graph, cactus_node_t x, float value, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_scalar_multiply(cactus_graph_t graph, cactus_node_t x, float value, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_scalar_divide(cactus_graph_t graph, cactus_node_t x, float value, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_scalar_exp(cactus_graph_t graph, cactus_node_t x, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_scalar_sqrt(cactus_graph_t graph, cactus_node_t x, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_scalar_cos(cactus_graph_t graph, cactus_node_t x, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_scalar_sin(cactus_graph_t graph, cactus_node_t x, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_scalar_log(cactus_graph_t graph, cactus_node_t x, cactus_node_t* out);
+
+CACTUS_FFI_EXPORT int cactus_graph_abs(cactus_graph_t graph, cactus_node_t x,
+cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_pow(cactus_graph_t graph, cactus_node_t x,
+float exponent, cactus_node_t* out);
+
+CACTUS_FFI_EXPORT int cactus_graph_view(
+    cactus_graph_t graph, cactus_node_t x, const size_t* shape, size_t rank,
+cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_flatten(
+    cactus_graph_t graph, cactus_node_t x, int32_t start_dim, int32_t end_dim,
+cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_reshape(
+    cactus_graph_t graph, cactus_node_t x, const size_t* shape, size_t rank, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_transpose(
+    cactus_graph_t graph, cactus_node_t x, int32_t backend, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_transpose_n(
+    cactus_graph_t graph, cactus_node_t x, const size_t* permutation, size_t rank, int32_t backend, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_slice(
+    cactus_graph_t graph, cactus_node_t x, int32_t axis, size_t start, size_t length, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_index(
+    cactus_graph_t graph, cactus_node_t x, size_t index_value, int32_t dim, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_sum(cactus_graph_t graph, cactus_node_t x, int32_t axis, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_mean(cactus_graph_t graph, cactus_node_t x, int32_t axis, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_variance(cactus_graph_t graph, cactus_node_t x, int32_t axis, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_min(cactus_graph_t graph, cactus_node_t x, int32_t axis, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_max(cactus_graph_t graph, cactus_node_t x, int32_t axis, cactus_node_t* out);
+
+CACTUS_FFI_EXPORT int cactus_graph_concat(
+    cactus_graph_t graph, cactus_node_t a, cactus_node_t b, int32_t axis,
+cactus_node_t* out);
+
+CACTUS_FFI_EXPORT int cactus_graph_cat(
+    cactus_graph_t graph, const cactus_node_t* nodes, size_t count, int32_t
+axis, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_matmul(
+    cactus_graph_t graph, cactus_node_t a, cactus_node_t b, bool pretransposed_rhs, int32_t backend, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_gather(
+    cactus_graph_t graph, cactus_node_t tensor, cactus_node_t indices, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_embedding_from_tensor(
+    cactus_graph_t graph, cactus_node_t embedding_tensor, cactus_node_t indices, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_embedding_from_file(
+    cactus_graph_t graph, const char* filename, cactus_node_t indices, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_mmap_embeddings(
+    cactus_graph_t graph, const char* filename, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_mmap_weights(
+    cactus_graph_t graph, const char* filename, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_bilinear_interpolation(
+    cactus_graph_t graph, cactus_node_t pos_embeds, size_t dst_height, size_t dst_width, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_set_grouped_scales(
+    cactus_graph_t graph, cactus_node_t node, size_t group_size, size_t num_groups, void* scales_ptr);
+CACTUS_FFI_EXPORT int cactus_graph_set_interleaved(
+    cactus_graph_t graph, cactus_node_t node, bool interleaved, size_t original_n);
+CACTUS_FFI_EXPORT int cactus_graph_release_weight_pages(cactus_graph_t graph, cactus_node_t node);
+CACTUS_FFI_EXPORT int cactus_graph_prefetch_weight_pages(cactus_graph_t graph, cactus_node_t node);
+CACTUS_FFI_EXPORT int cactus_graph_release_all_weight_pages(cactus_graph_t graph);
+
+CACTUS_FFI_EXPORT int cactus_graph_relu(cactus_graph_t graph, cactus_node_t x, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_silu(cactus_graph_t graph, cactus_node_t x, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_gelu(cactus_graph_t graph, cactus_node_t x, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_gelu_erf(cactus_graph_t graph, cactus_node_t x, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_sigmoid(cactus_graph_t graph, cactus_node_t x, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_tanh(cactus_graph_t graph, cactus_node_t x, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_glu(cactus_graph_t graph, cactus_node_t x, int32_t axis, cactus_node_t* out);
+
+CACTUS_FFI_EXPORT int cactus_graph_layernorm(
+    cactus_graph_t graph, cactus_node_t input, cactus_node_t weight, cactus_node_t bias, float epsilon, bool has_bias, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_groupnorm(
+    cactus_graph_t graph, cactus_node_t input, cactus_node_t weight, cactus_node_t bias, size_t num_groups, float epsilon, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_batchnorm(
+    cactus_graph_t graph, cactus_node_t input, cactus_node_t weight, cactus_node_t bias, cactus_node_t running_mean, cactus_node_t running_var, int32_t axis, float epsilon, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_topk(cactus_graph_t graph, cactus_node_t input, size_t k, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_rms_norm(
+    cactus_graph_t graph, cactus_node_t input, cactus_node_t weight, float epsilon, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_rope(
+    cactus_graph_t graph, cactus_node_t input, float theta, size_t position_offset, int32_t backend, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_rope_gptj(
+    cactus_graph_t graph, cactus_node_t input, float theta, size_t position_offset, size_t rot_dim, int32_t backend, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_softmax(cactus_graph_t graph, cactus_node_t input, int32_t axis, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_attention(
+    cactus_graph_t graph, cactus_node_t query, cactus_node_t key, cactus_node_t value, float scale, bool is_causal, size_t position_offset, size_t window_size, int32_t backend, bool use_mask, cactus_node_t mask, bool additive_mask, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_rel_pos_bias(
+    cactus_graph_t graph, cactus_node_t query, cactus_node_t relative_key, float scale, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_attention_int8_hybrid(
+    cactus_graph_t graph, cactus_node_t query, cactus_node_t key_new, cactus_node_t value_new, float scale, size_t position_offset,
+    const int8_t* cached_keys, const int8_t* cached_values, const float* k_scales, const float* v_scales,
+    size_t cache_len, size_t num_kv_heads, size_t head_dim, size_t window_size, cactus_node_t* out);
+
+CACTUS_FFI_EXPORT int cactus_graph_conv1d_causal(
+    cactus_graph_t graph, cactus_node_t input, cactus_node_t weight, size_t kernel_size, size_t dilation, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_conv1d_k3(
+    cactus_graph_t graph, cactus_node_t input, cactus_node_t weight, size_t stride, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_conv1d_k7s3(
+    cactus_graph_t graph, cactus_node_t input, cactus_node_t weight, cactus_node_t bias, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_conv1d(
+    cactus_graph_t graph, cactus_node_t input, cactus_node_t weight, bool has_bias, cactus_node_t bias, size_t stride, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_conv1d_same_depthwise_k9(
+    cactus_graph_t graph, cactus_node_t input, cactus_node_t weight, bool has_bias, cactus_node_t bias, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_conv1d_pointwise(
+    cactus_graph_t graph, cactus_node_t input, cactus_node_t weight, bool has_bias, cactus_node_t bias, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_conv2d_k3s2p1(
+    cactus_graph_t graph, cactus_node_t input, cactus_node_t weight, bool has_bias, cactus_node_t bias, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_conv2d_depthwise_k3s2p1(
+    cactus_graph_t graph, cactus_node_t input, cactus_node_t weight, bool has_bias, cactus_node_t bias, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_conv2d_pointwise_1x1(
+    cactus_graph_t graph, cactus_node_t input, cactus_node_t weight, bool has_bias, cactus_node_t bias, cactus_node_t* out);
+
+CACTUS_FFI_EXPORT int cactus_graph_lstm_cell(
+    cactus_graph_t graph, cactus_node_t input, cactus_node_t h_prev, cactus_node_t c_prev, cactus_node_t weight_ih, cactus_node_t weight_hh, cactus_node_t bias_ih, cactus_node_t bias_hh, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_gated_deltanet_decode(
+    cactus_graph_t graph, cactus_node_t query, cactus_node_t key, cactus_node_t value, cactus_node_t gate_log, cactus_node_t beta, cactus_node_t initial_state, float scale, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_gated_deltanet_prefill(
+    cactus_graph_t graph, cactus_node_t query, cactus_node_t key, cactus_node_t value, cactus_node_t gate_log, cactus_node_t beta, cactus_node_t initial_state, size_t chunk_size, float scale, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_stft(
+    cactus_graph_t graph, cactus_node_t input, cactus_node_t weight, size_t stride, size_t num_fft_bins, cactus_node_t* out);
+
+CACTUS_FFI_EXPORT int cactus_graph_altup_predict(
+    cactus_graph_t graph, cactus_node_t coefs, const cactus_node_t* streams, size_t num_streams, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_altup_correct(
+    cactus_graph_t graph, cactus_node_t coefs, cactus_node_t innovation, const cactus_node_t* predictions, size_t num_predictions, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_gaussian_topk(
+    cactus_graph_t graph, cactus_node_t input, float ppf, cactus_node_t* out);
+
+CACTUS_FFI_EXPORT int cactus_graph_moe_layer_gated(
+    cactus_graph_t graph, cactus_node_t hidden, cactus_node_t routing_probs, cactus_node_t topk_indices,
+    const cactus_node_t* w1_weights, const cactus_node_t* w3_weights, const cactus_node_t* w2_weights,
+    size_t num_experts, size_t num_experts_per_tok, bool normalize_routing, float epsilon, float routed_scaling_factor, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_moe_layer_ungated(
+    cactus_graph_t graph, cactus_node_t hidden, cactus_node_t routing_probs, cactus_node_t topk_indices,
+    const cactus_node_t* w1_weights, const cactus_node_t* w2_weights,
+    size_t num_experts, size_t num_experts_per_tok, bool normalize_routing, float epsilon, float routed_scaling_factor, int32_t activation, cactus_node_t* out);
+
+CACTUS_FFI_EXPORT int cactus_graph_sample(
+    cactus_graph_t graph, cactus_node_t logits, float temperature, float top_p, size_t top_k, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_scatter_topk(
+    cactus_graph_t graph, cactus_node_t indices, cactus_node_t values, size_t num_classes, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_persistent(
+    cactus_graph_t graph, cactus_node_t source_node, cactus_node_t* out);
+CACTUS_FFI_EXPORT int cactus_graph_is_populated(
+    cactus_graph_t graph, cactus_node_t persistent_node, int32_t* out_is_populated);
+CACTUS_FFI_EXPORT int cactus_graph_invalidate_persistent(
+    cactus_graph_t graph, cactus_node_t persistent_node);
+
+CACTUS_FFI_EXPORT int cactus_graph_execute(cactus_graph_t graph);
+CACTUS_FFI_EXPORT int cactus_graph_get_output_ptr(cactus_graph_t graph,
+cactus_node_t node, void** out_ptr);
+CACTUS_FFI_EXPORT int cactus_graph_get_output_info(cactus_graph_t graph,
+cactus_node_t node, cactus_tensor_info_t* out_info);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/cactus_utils.h b/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/cactus_utils.h
index 3b5d97f..6570f09 100644
--- a/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/cactus_utils.h
+++ b/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/cactus_utils.h
@@ -6,6 +6,7 @@
 #include <string>
 #include <vector>
 #include <unordered_map>
+#include <map>
 #include <stdexcept>
 #include <sstream>
 #include <iomanip>
@@ -63,6 +64,16 @@ struct CactusModelHandle {
     std::unique_ptr<cactus::engine::Model> vad_model;
     std::atomic<bool> should_stop;
     std::vector<uint32_t> processed_tokens;
+    struct ProcessedImage {  // An image identified by its path and last-modified timestamp.
+        std::string path;
+        long long last_modified_timestamp = 0;  // NOTE(review): unit/epoch not visible here — confirm
+        // Equal only when both the path and the timestamp match.
+        bool operator==(const ProcessedImage& other) const {
+            return path == other.path && last_modified_timestamp == other.last_modified_timestamp;
+        }
+    };
+    // NOTE(review): presumably one inner list of images per processed message — confirm against callers.
+    std::vector<std::vector<ProcessedImage>> processed_images;
     std::mutex model_mutex;
     std::string model_name;
     std::unique_ptr<cactus::engine::index::Index> corpus_index;
@@ -124,6 +135,66 @@ inline cactus::engine::AudioProcessor::SpectrogramConfig get_parakeet_spectrogra
     return cfg;
 }
 
+inline cactus::engine::AudioProcessor::SpectrogramConfig get_htk_spectrogram_config() {  // Base settings; get_gemma4_audio_spectrogram_config starts from these.
+    cactus::engine::AudioProcessor::SpectrogramConfig cfg{};  // value-initialized; fields not assigned below keep that state
+    cfg.n_fft        = 321;  // NOTE(review): differs from both frame_length (320) and fft_override (1024) — confirm intent
+    cfg.frame_length = 320;  // preprocess_audio_for_gemma4 prepends frame_length / 2 zeros before analysis
+    cfg.fft_override = 1024;  // when > 0, the gemma4 path sizes its mel filter bank from this instead of n_fft
+    cfg.hop_length   = 160;  // half of frame_length
+    cfg.power        = 1.0f;
+    cfg.center       = false;
+    cfg.pad_mode     = "constant";
+    cfg.onesided     = true;
+    cfg.dither       = 0.0f;
+    cfg.mel_floor    = 0.001f;  // same value as min_value below
+    cfg.log_mel      = "log";
+    cfg.reference    = 1.0f;
+    cfg.min_value    = 0.001f;
+    cfg.remove_dc_offset = false;
+    cfg.hann_periodic = true;
+    return cfg;
+}
+
+inline cactus::engine::AudioProcessor::SpectrogramConfig get_gemma4_audio_spectrogram_config(
+    const cactus::engine::Config& model_config) {  // HTK base config adjusted with the model's FFT length.
+    auto cfg = get_htk_spectrogram_config();
+    cfg.fft_override = model_config.audio_fft_length;  // replaces the base value of 1024
+    cfg.mel_floor_additive = true;  // NOTE(review): presumably adds mel_floor instead of clamping to it — confirm in AudioProcessor
+    return cfg;
+}
+
+inline cactus::engine::AudioProcessor::SpectrogramConfig get_wespeaker_spectrogram_config() {  // NOTE(review): named for WeSpeaker; no caller is visible in this chunk.
+    cactus::engine::AudioProcessor::SpectrogramConfig cfg{};  // value-initialized; fields not assigned below keep that state
+    cfg.n_fft            = 512;
+    cfg.frame_length     = 400;
+    cfg.hop_length       = 160;
+    cfg.power            = 2.0f;  // the HTK config above uses 1.0f
+    cfg.center           = false;
+    cfg.pad_mode         = "constant";
+    cfg.onesided         = true;
+    cfg.dither           = 0.0f;
+    cfg.mel_floor        = 1.1754944e-38f;  // smallest positive normal float (FLT_MIN)
+    cfg.log_mel          = "log";
+    cfg.reference        = 1.0f;
+    cfg.min_value        = 1.1754944e-38f;  // same FLT_MIN value as mel_floor
+    cfg.remove_dc_offset = true;
+    cfg.preemphasis      = 0.97f;  // same coefficient as the default of apply_preemphasis in this header
+    cfg.hann_periodic    = false;
+    cfg.window_a0        = 0.54f;  // NOTE(review): 0.54 is the Hamming a0 coefficient — presumably selects a Hamming window; confirm
+    return cfg;
+}
+
+inline std::vector<float> transpose_mel_to_frame_major(const std::vector<float>& mel,
+                                                        size_t num_mels, size_t num_frames) {  // [num_mels][num_frames] -> [num_frames][num_mels]
+    std::vector<float> transposed(num_frames * num_mels);
+    for (size_t m = 0; m < num_mels; m++) {
+        for (size_t t = 0; t < num_frames; t++) {
+            transposed[t * num_mels + m] = mel[m * num_frames + t];  // mel must hold at least num_mels * num_frames values; not checked
+        }
+    }
+    return transposed;
+}
+
 inline void apply_preemphasis(std::vector<float>& waveform, float coefficient = 0.97f) {
     if (waveform.size() < 2 || coefficient == 0.0f) {
         return;
@@ -180,6 +251,56 @@ inline void trim_mel_frames(std::vector<float>& mel, size_t num_mels, size_t val
     mel.swap(trimmed);
 }
 
+struct AudioPreprocessResult {  // Return type of preprocess_audio_for_gemma4.
+    std::vector<float> features;  // frame-major mel values (output of transpose_mel_to_frame_major)
+    size_t num_frames = 0;        // mel value count divided by the number of mel bins
+    size_t num_soft_tokens = 0;   // num_frames halved twice, rounding up each time
+};
+
+inline AudioPreprocessResult preprocess_audio_for_gemma4(
+    std::vector<float> audio_samples,  // taken by value: it is padded in place below
+    const cactus::engine::Config& model_config
+) {  // Pads the waveform, computes its mel spectrogram and returns it frame-major with frame/token counts.
+    AudioPreprocessResult result;
+    if (audio_samples.empty()) return result;  // empty input: no features, zero counts
+    // Zero-pad the tail so the sample count becomes a multiple of 320.
+    size_t pad_amt = 320 - (audio_samples.size() % 320);
+    if (pad_amt < 320)  // pad_amt == 320 means the count is already a multiple of 320
+        audio_samples.resize(audio_samples.size() + pad_amt, 0.0f);
+
+    size_t mel_bins = model_config.audio_input_feat_size;  // NOTE(review): a value of 0 would divide by zero below — confirm it is validated upstream
+    auto cfg = get_gemma4_audio_spectrogram_config(model_config);
+    // Prepend frame_length / 2 zeros in front of the signal.
+    size_t semicausal_pad = cfg.frame_length / 2;
+    audio_samples.insert(audio_samples.begin(), semicausal_pad, 0.0f);
+    // The mel filter bank is sized from fft_override when it is set, otherwise from n_fft.
+    cactus::engine::AudioProcessor ap;
+    size_t fft_for_mel = cfg.fft_override > 0 ? cfg.fft_override : cfg.n_fft;
+    ap.init_mel_filters(fft_for_mel / 2 + 1, mel_bins, 0.0f, 8000.0f, 16000,
+                        nullptr, "htk");  // NOTE(review): 0.0f / 8000.0f / 16000 look like min Hz, max Hz and sample rate — confirm
+    std::vector<float> mel = ap.compute_spectrogram(audio_samples, cfg);
+    // The spectrogram is consumed as mel-major: mel_bins rows of num_frames values each.
+    result.num_frames = mel.size() / mel_bins;
+    result.features = transpose_mel_to_frame_major(mel, mel_bins, result.num_frames);
+    // Two successive halvings of the frame count, each rounding up.
+    size_t after_stage1 = (result.num_frames + 1) / 2;
+    result.num_soft_tokens = (after_stage1 + 1) / 2;
+
+    return result;
+}
+
+// Converts 16-bit little-endian PCM bytes to floats in [-1, 1). NULL input yields an empty vector.
+inline std::vector<float> pcm_buffer_to_float_samples(
+    const uint8_t* pcm_buffer, size_t pcm_buffer_size) {
+    const size_t num_samples = pcm_buffer ? pcm_buffer_size / 2 : 0;  // an odd trailing byte is ignored
+    std::vector<float> waveform_fp32(num_samples);
+    for (size_t i = 0; i < num_samples; i++) {  // bytewise reads: casting to int16_t* is UB for unaligned buffers
+        const uint16_t bits = static_cast<uint16_t>(pcm_buffer[2 * i] | (pcm_buffer[2 * i + 1] << 8));
+        waveform_fp32[i] = static_cast<float>(static_cast<int16_t>(bits)) * (1.0f / 32768.0f);
+    }
+    return waveform_fp32;
+}
+
 } // namespace audio
 } // namespace cactus
 
@@ -226,6 +347,24 @@ struct ToolFunction {
     std::unordered_map<std::string, std::string> parameters;
 };
 
+struct InferenceOptions {  // Inference settings and their defaults. NOTE(review): the code that fills this (presumably from options_json) is outside this view.
+    float temperature = 0.0f;
+    float top_p = 0.0f;
+    float confidence_threshold = 0.7f;
+    size_t top_k = 0;
+    size_t max_tokens = 100;
+    size_t tool_rag_top_k = 2;
+    size_t cloud_timeout_ms = 15000;  // 15 seconds
+    std::vector<std::string> stop_sequences;  // empty by default
+    bool force_tools = false;
+    bool include_stop_sequences = false;
+    bool use_vad = true;
+    bool telemetry_enabled = true;
+    bool auto_handoff = true;
+    bool handoff_with_images = true;
+    bool enable_thinking_if_supported = true;
+};
+
 } // namespace ffi
 } // namespace cactus
 
@@ -262,6 +401,24 @@ inline std::string trim_string(const std::string& s) {
     return s.substr(start, end - start);
 }
 
+inline size_t find_matching_delimiter(const std::string& s, size_t pos, char open, char close) {  // Returns the index just past the `close` that balances the opener at s[pos].
+    int depth = 1;  // the opener at s[pos] is already counted
+    pos++;  // start scanning after the opener
+    while (pos < s.length() && depth > 0) {
+        if (s[pos] == open) depth++;
+        else if (s[pos] == close) depth--;
+        else if (s[pos] == '"') {  // delimiters inside a double-quoted string are not counted
+            pos++;
+            while (pos < s.length() && s[pos] != '"') {
+                if (s[pos] == '\\') pos++;  // step over the escaped character as well
+                pos++;
+            }
+        }
+        pos++;  // also runs after the balancing close, hence "just past"
+    }
+    return pos;  // NOTE(review): with no balancing close this is >= s.length() (up to length + 2 after a trailing backslash) — callers must bounds-check
+}
+
 inline std::string env_or_default(const char* key, const char* fallback) {
     const char* v = std::getenv(key);
     if (v && v[0] != '\0') return std::string(v);
@@ -377,6 +534,119 @@ inline std::string serialize_tools_json(const std::vector<ToolFunction>& tools)
     return oss.str();
 }
 
+namespace json_sorted {
+// Minimal JSON re-serializer: sorted object keys, ", " between items, ": " after keys. No validation.
+inline void skip_ws(const std::string& s, size_t& p) {  // advances p past any whitespace
+    while (p < s.size() && std::isspace(static_cast<unsigned char>(s[p]))) p++;
+}
+// Copies the string literal starting at s[p] verbatim, escapes included; p ends past the closing quote.
+inline std::string parse_string(const std::string& s, size_t& p) {
+    std::string r = "\"";
+    p++;  // the character at s[p] is consumed unconditionally as the opening quote
+    while (p < s.size()) {
+        if (s[p] == '\\') {
+            r += s[p++];
+            if (p < s.size()) r += s[p++];  // keep the escaped character, even if it is a quote
+        } else if (s[p] == '"') {
+            r += '"';
+            p++;
+            return r;
+        } else {
+            r += s[p++];
+        }
+    }
+    return r;  // unterminated literal: return what was read, without a closing quote
+}
+// Forward declaration: objects and arrays recurse through parse_value.
+inline std::string parse_value(const std::string& s, size_t& p);
+// Re-emits the object at s[p] ('{') with keys in std::map order; a repeated key keeps its last value.
+inline std::string parse_object(const std::string& s, size_t& p) {
+    p++;  // consume '{'
+    std::map<std::string, std::string> entries;
+    skip_ws(s, p);
+    while (p < s.size() && s[p] != '}') {
+        if (s[p] == ',') { p++; skip_ws(s, p); continue; }
+        std::string key = parse_string(s, p);  // keys keep their quotes, so ordering compares the quoted text
+        skip_ws(s, p);
+        if (p < s.size() && s[p] == ':') p++;
+        skip_ws(s, p);
+        std::string val = parse_value(s, p);
+        entries[key] = val;
+        skip_ws(s, p);
+    }
+    if (p < s.size()) p++;  // consume '}'
+    std::string r = "{";
+    bool first = true;
+    for (const auto& kv : entries) {
+        if (!first) r += ", ";
+        r += kv.first + ": " + kv.second;
+        first = false;
+    }
+    r += "}";
+    return r;
+}
+// Re-emits the array at s[p] ('[') with ", " between items; p ends past the closing ']'.
+inline std::string parse_array(const std::string& s, size_t& p) {
+    p++;  // consume '['
+    std::string r = "[";
+    skip_ws(s, p);
+    while (p < s.size() && s[p] != ']') {
+        if (s[p] == ',') { p++; skip_ws(s, p); continue; }
+        const size_t before = p;
+        const std::string item = parse_value(s, p);
+        // A stray '}' makes parse_value return without consuming input; skip that
+        // byte, otherwise this loop would append empty items forever.
+        if (p == before) { p++; skip_ws(s, p); continue; }
+        if (r.size() > 1) r += ", ";  // r is longer than "[" once an item was written
+        r += item;
+        skip_ws(s, p);
+    }
+    if (p < s.size()) p++;  // consume ']'
+    return r + "]";
+}
+// Dispatches on the first non-whitespace character; anything else is copied as a bare scalar token.
+inline std::string parse_value(const std::string& s, size_t& p) {
+    skip_ws(s, p);
+    if (p >= s.size()) return "";
+    if (s[p] == '"') return parse_string(s, p);
+    if (s[p] == '{') return parse_object(s, p);
+    if (s[p] == '[') return parse_array(s, p);
+    size_t start = p;  // scalar (number, true, false, null): runs to the next ',', '}', ']' or whitespace
+    while (p < s.size() && s[p] != ',' && s[p] != '}' && s[p] != ']' && !std::isspace(static_cast<unsigned char>(s[p]))) p++;
+    return s.substr(start, p - start);  // empty when s[p] is already a terminator
+}
+// Entry point: re-serializes the first JSON value in `json`; trailing text is ignored.
+inline std::string reformat(const std::string& json) {
+    size_t p = 0;
+    return parse_value(json, p);
+}
+
+} // namespace json_sorted
+
+inline std::string serialize_tools_for_template(const std::vector<ToolFunction>& tools) {  // One "\n{...}" JSON line per tool; "" when there are none.
+    if (tools.empty()) return "";
+    std::string result;
+    for (const auto& tool : tools) {
+        std::map<std::string, std::string> func_fields;  // std::map, so the fields are emitted in sorted key order
+        func_fields["\"description\""] = "\"" + escape_json_string(tool.description) + "\"";
+        func_fields["\"name\""] = "\"" + escape_json_string(tool.name) + "\"";
+        auto it = tool.parameters.find("schema");
+        if (it != tool.parameters.end()) {  // "parameters" is left out when the tool has no "schema" entry
+            func_fields["\"parameters\""] = json_sorted::reformat(it->second);  // schema text is re-serialized with sorted keys
+        }
+        std::string func_json = "{";
+        bool first = true;
+        for (const auto& kv : func_fields) {
+            if (!first) func_json += ", ";
+            func_json += kv.first + ": " + kv.second;
+            first = false;
+        }
+        func_json += "}";
+        result += "\n{\"function\": " + func_json + ", \"type\": \"function\"}";  // every entry starts with a newline, including the first
+    }
+    return result;
+}
+
 inline void handle_error_response(const std::string& error_message, char* response_buffer, size_t buffer_size) {
     std::ostringstream json;
     json << "{";
@@ -401,10 +671,12 @@ inline void handle_error_response(const std::string& error_message, char* respon
     }
 }
 
-inline std::vector<cactus::engine::ChatMessage> parse_messages_json(const std::string& json, 
-                                                                   std::vector<std::string>& out_image_paths) {
+inline std::vector<cactus::engine::ChatMessage> parse_messages_json(const std::string& json,
+                                                                   std::vector<std::string>& out_image_paths,
+                                                                   std::vector<std::string>* out_audio_paths = nullptr) {
     std::vector<cactus::engine::ChatMessage> messages;
     out_image_paths.clear();
+    if (out_audio_paths) out_audio_paths->clear();
     
     size_t pos = json.find('[');
     if (pos == std::string::npos) {
@@ -457,39 +729,111 @@ inline std::vector<cactus::engine::ChatMessage> parse_messages_json(const std::s
             }
         }
         
-        size_t images_pos = json.find("\"images\"", pos);
-        if (images_pos != std::string::npos && images_pos < obj_end) {
-            size_t array_start = json.find('[', images_pos);
-            if (array_start != std::string::npos && array_start < obj_end) {
-                size_t array_end = json.find(']', array_start);
-                if (array_end != std::string::npos && array_end < obj_end) {
-                    size_t img_pos = array_start;
-                    while (true) {
-                        img_pos = json.find('"', img_pos + 1);
-                        if (img_pos == std::string::npos || img_pos >= array_end) break;
-                        
-                        size_t img_start = img_pos + 1;
-                        size_t img_end = json.find('"', img_start);
-                        if (img_end == std::string::npos || img_end > array_end) break;
-                        
-                        std::string img_path = json.substr(img_start, img_end - img_start);
-                        
-                        std::filesystem::path p(img_path);
-                        img_path = std::filesystem::absolute(p).string();
-                        
-                        msg.images.push_back(img_path);
-                        out_image_paths.push_back(img_path);
-                        img_pos = img_end;
+        auto parse_path_array = [&](const char* key, std::vector<std::string>& dest,
+                                    std::vector<std::string>* out_paths) {
+            size_t key_pos = json.find(key, pos);
+            if (key_pos == std::string::npos || key_pos >= obj_end) return;
+            size_t array_start = json.find('[', key_pos);
+            if (array_start == std::string::npos || array_start >= obj_end) return;
+            size_t array_end = json.find(']', array_start);
+            if (array_end == std::string::npos || array_end >= obj_end) return;
+            size_t cur = array_start;
+            while (true) {
+                cur = json.find('"', cur + 1);
+                if (cur == std::string::npos || cur >= array_end) break;
+                size_t str_start = cur + 1;
+                size_t str_end = json.find('"', str_start);
+                if (str_end == std::string::npos || str_end > array_end) break;
+                std::string path = std::filesystem::absolute(
+                    std::filesystem::path(json.substr(str_start, str_end - str_start))).string();
+                dest.push_back(path);
+                if (out_paths) out_paths->push_back(path);
+                cur = str_end;
+            }
+        };
+
+        parse_path_array("\"images\"", msg.images, &out_image_paths);
+        parse_path_array("\"audio\"", msg.audio, out_audio_paths);
+
+        if (msg.role == "tool") {
+            size_t name_pos = json.find("\"name\"", obj_start);
+            if (name_pos != std::string::npos && name_pos < obj_end) {
+                size_t name_quote = json.find('"', name_pos + 6);
+                if (name_quote != std::string::npos && name_quote < obj_end) {
+                    size_t name_start = name_quote + 1;
+                    size_t name_end = json.find('"', name_start);
+                    if (name_end != std::string::npos && name_end < obj_end) {
+                        msg.name = json.substr(name_start, name_end - name_start);
                     }
                 }
             }
         }
-        
+
+        size_t tool_calls_pos = json.find("\"tool_calls\"", obj_start);
+        if (tool_calls_pos != std::string::npos && tool_calls_pos < obj_end) {
+            size_t tool_calls_arr_start = json.find('[', tool_calls_pos);
+            if (tool_calls_arr_start != std::string::npos && tool_calls_arr_start < obj_end) {
+                size_t tool_calls_arr_end = find_matching_delimiter(json, tool_calls_arr_start, '[', ']');
+
+                size_t search_pos = tool_calls_arr_start;
+                while (true) {
+                    size_t func_pos = json.find("\"function\"", search_pos);
+                    if (func_pos == std::string::npos || func_pos >= tool_calls_arr_end) break;
+
+                    size_t func_obj_start = json.find('{', func_pos + 10);
+                    if (func_obj_start == std::string::npos || func_obj_start >= tool_calls_arr_end) break;
+
+                    size_t func_obj_end = find_matching_delimiter(json, func_obj_start, '{', '}');
+
+                    cactus::engine::ToolCallInfo tool_call;
+
+                    size_t fn_name_pos = json.find("\"name\"", func_obj_start);
+                    if (fn_name_pos != std::string::npos && fn_name_pos < func_obj_end) {
+                        size_t fn_name_quote = json.find('"', fn_name_pos + 6);
+                        if (fn_name_quote != std::string::npos && fn_name_quote < func_obj_end) {
+                            size_t fn_name_start = fn_name_quote + 1;
+                            size_t fn_name_end = json.find('"', fn_name_start);
+                            if (fn_name_end != std::string::npos && fn_name_end < func_obj_end) {
+                                tool_call.name = json.substr(fn_name_start, fn_name_end - fn_name_start);
+                            }
+                        }
+                    }
+
+                    size_t args_pos = json.find("\"arguments\"", func_obj_start);
+                    if (args_pos != std::string::npos && args_pos < func_obj_end) {
+                        size_t colon_pos = json.find(':', args_pos + 11);
+                        if (colon_pos != std::string::npos && colon_pos < func_obj_end) {
+                            size_t args_start = colon_pos + 1;
+                            while (args_start < json.length() && std::isspace(static_cast<unsigned char>(json[args_start]))) args_start++;
+
+                            if (args_start < func_obj_end && json[args_start] == '{') {
+                                size_t args_end = find_matching_delimiter(json, args_start, '{', '}');
+                                tool_call.arguments = json.substr(args_start, args_end - args_start);
+                            } else if (args_start < func_obj_end && json[args_start] == '"') {
+                                size_t str_start = args_start + 1;
+                                size_t str_end = str_start;
+                                while (str_end < json.length() && json[str_end] != '"') {
+                                    if (json[str_end] == '\\') str_end++;
+                                    str_end++;
+                                }
+                                tool_call.arguments = json.substr(str_start, str_end - str_start);
+                            }
+                        }
+                    }
+
+                    if (!tool_call.name.empty()) {
+                        msg.tool_calls.push_back(tool_call);
+                    }
+                    search_pos = func_obj_end;
+                }
+            }
+        }
+
         messages.push_back(msg);
-        
+
         pos = json.find('{', obj_end);
     }
-    
+
     return messages;
 }
 
@@ -538,128 +882,433 @@ inline std::vector<ToolFunction> parse_tools_json(const std::string& json) {
         
         pos = json.find("\"function\"", name_pos);
     }
-    
+
     return tools;
 }
 
-inline void parse_options_json(const std::string& json,
-                               float& temperature, float& top_p,
-                               size_t& top_k, size_t& max_tokens,
-                               std::vector<std::string>& stop_sequences,
-                               bool& force_tools,
-                               size_t& tool_rag_top_k,
-                               float& confidence_threshold,
-                               bool& include_stop_sequences,
-                               bool& use_vad,
-                               bool& telemetry_enabled,
-                               bool* auto_handoff = nullptr,
-                               size_t* cloud_timeout_ms = nullptr,
-                               bool* handoff_with_images = nullptr) {
-    temperature = 0.0f;
-    top_p = 0.0f;
-    top_k = 0;
-    max_tokens = 100;
-    force_tools = false;
-    tool_rag_top_k = 2;
-    confidence_threshold = 0.7f;
-    include_stop_sequences = false;
-    use_vad = true;
-    telemetry_enabled = true;
-    if (auto_handoff) *auto_handoff = true;
-    if (cloud_timeout_ms) *cloud_timeout_ms = 15000;
-    if (handoff_with_images) *handoff_with_images = true;
-    stop_sequences.clear();
+inline bool try_parse_json_float(const std::string& json, const std::string& key, float& out_value) {
+    // Looks for the literal text "<key>": (no whitespace is accepted before the colon).
+    const std::string needle = "\"" + key + "\":";
+    const size_t hit = json.find(needle);
+    if (hit == std::string::npos) return false;
+
+    // Skip whitespace after the colon, then take everything up to the next delimiter.
+    size_t first = hit + needle.size();
+    while (first < json.size() && std::isspace(static_cast<unsigned char>(json[first]))) ++first;
+    size_t last = json.find_first_of(",}] \t\n\r", first);
+    if (last == std::string::npos) last = json.size();
+    try {
+        out_value = std::stof(json.substr(first, last - first));
+    } catch (...) {
+        return false;  // std::stof rejected the text; out_value is left untouched
+    }
+    return true;
+}
+
+inline std::vector<std::string> parse_json_string_array_field(const std::string& json, const std::string& key) {
+    // Extracts the string elements of the JSON array stored under `key`. The result is empty
+    // when the literal text "<key>": is absent, when the value is not an array, or when the
+    // array is never closed. Reading stops at the first element that is not a string, keeping
+    // the strings collected before it.
+    std::vector<std::string> out;
+    // No whitespace is tolerated between the key's closing quote and the colon.
+    const std::string needle = "\"" + key + "\":";
+    const size_t hit = json.find(needle);
+    if (hit == std::string::npos) return out;
+
+    size_t open = hit + needle.size();
+    while (open < json.size() && std::isspace(static_cast<unsigned char>(json[open]))) ++open;
+    if (open >= json.size() || json[open] != '[') return out;
+
+    // Pass 1: find the matching ']'. Brackets inside string literals are ignored, and a
+    // backslash inside a string hides the character that follows it.
+    size_t past_close = open + 1;
+    int nesting = 1;
+    bool quoted = false;
+    bool skip_next = false;
+    for (; past_close < json.size() && nesting > 0; ++past_close) {
+        const char ch = json[past_close];
+        if (!quoted) {
+            if (ch == '"') quoted = true;
+            else if (ch == '[') ++nesting;
+            else if (ch == ']') --nesting;
+        } else if (skip_next) {
+            skip_next = false;
+        } else if (ch == '\\') {
+            skip_next = true;
+        } else if (ch == '"') {
+            quoted = false;
+        }
+    }
+    if (nesting != 0) return out;
+    const size_t close = past_close - 1;  // index of the array's closing ']'
+
+    // Pass 2: walk the elements between the brackets.
+    const std::string escape_codes = "bfnrt";
+    const std::string escape_chars = "\b\f\n\r\t";
+    size_t cur = open + 1;
+    while (cur < close) {
+        // Skip separators: any run of whitespace and commas.
+        while (cur < close &&
+               (std::isspace(static_cast<unsigned char>(json[cur])) || json[cur] == ',')) {
+            ++cur;
+        }
+        // Anything other than a string (number, nested array, end of array) ends the scan.
+        if (cur >= close || json[cur] != '"') break;
+        ++cur;
+
+        std::string element;
+        while (cur < past_close) {
+            const char ch = json[cur++];
+            if (ch == '"') {
+                out.push_back(element);
+                break;
+            }
+            if (ch != '\\') {
+                element.push_back(ch);
+                continue;
+            }
+
+            // Escape sequence: \b \f \n \r \t become control characters; every other escaped
+            // character ('"', '\\', '/', and also the 'u' of \uXXXX) is copied through as-is,
+            // so \uXXXX sequences are NOT decoded.
+            if (cur >= past_close) break;
+            const char escaped_char = json[cur++];
+            const size_t code = escape_codes.find(escaped_char);
+            element.push_back(code == std::string::npos ? escaped_char : escape_chars[code]);
+        }
+    }
 
+    return out;
+}
+
+inline void parse_custom_vocabulary_options(const std::string& json,
+                                            std::vector<std::string>& custom_vocabulary,
+                                            float& vocabulary_boost) {
+    vocabulary_boost = 5.0f;  // default, kept when the option is absent or unparsable
+    custom_vocabulary.clear();
     if (json.empty()) return;
 
+    // "vocabulary_boost" is optional; when present it is clamped to [0, 20].
+    float requested = 5.0f;
+    const bool has_boost = try_parse_json_float(json, "vocabulary_boost", requested);
+    if (has_boost) vocabulary_boost = std::clamp(requested, 0.0f, 20.0f);
+
+    custom_vocabulary = parse_json_string_array_field(json, "custom_vocabulary");
+}
+
+inline std::unordered_map<uint32_t, float> build_token_bias_map(const std::vector<std::vector<uint32_t>>& tokenized_entries,
+                                                                float vocabulary_boost) {
+    // Maps every token id that occurs in any entry to the boost value clamped to [0, 20].
+    // A boost that clamps to exactly zero yields an empty map.
+    const float boost = std::clamp(vocabulary_boost, 0.0f, 20.0f);
+    std::unordered_map<uint32_t, float> bias_by_token;
+    if (boost == 0.0f) return bias_by_token;
+
+    for (const std::vector<uint32_t>& entry_tokens : tokenized_entries) {
+        for (const uint32_t id : entry_tokens) {
+            // operator[] inserts 0.0f for a new id; keep the larger of stored value and boost.
+            float& slot = bias_by_token[id];
+            slot = std::max(slot, boost);
+        }
+    }
+    return bias_by_token;
+}
+
+inline std::unordered_map<uint32_t, float> build_custom_vocabulary_bias(cactus::engine::Tokenizer* tokenizer,
+                                                                        const std::vector<std::string>& custom_vocabulary,
+                                                                        float vocabulary_boost) {
+    // Without a tokenizer or any vocabulary there is nothing to bias.
+    if (tokenizer == nullptr || custom_vocabulary.empty()) return {};
+
+    // Encode each non-empty entry, then hand the token ids to build_token_bias_map.
+    std::vector<std::vector<uint32_t>> encoded;
+    encoded.reserve(custom_vocabulary.size());
+    for (const std::string& phrase : custom_vocabulary) {
+        if (!phrase.empty()) encoded.push_back(tokenizer->encode(phrase));
+    }
+    return build_token_bias_map(encoded, vocabulary_boost);
+}
+
+inline void apply_custom_vocabulary_options(cactus::engine::Model* model, const std::string& json) {
+    // Parses the vocabulary options in `json` and sets the resulting bias on `model`; null model is a no-op.
+    if (model == nullptr) return;
+    float boost = 5.0f;
+    std::vector<std::string> vocabulary;
+    parse_custom_vocabulary_options(json, vocabulary, boost);
+    model->set_vocab_bias(build_custom_vocabulary_bias(model->get_tokenizer(), vocabulary, boost));
+}
+
+inline size_t levenshtein_ci(const std::string& a, const std::string& b) {
+    // Edit distance (insert/delete/substitute cost 1), bytes compared via std::tolower; two rolling rows.
+    auto lower = [](char c) { return std::tolower(static_cast<unsigned char>(c)); };
+    std::vector<size_t> above(b.size() + 1), row(b.size() + 1);
+    for (size_t col = 0; col < above.size(); ++col) above[col] = col;
+    for (size_t r = 0; r < a.size(); ++r) {
+        row[0] = r + 1;
+        for (size_t col = 0; col < b.size(); ++col) {
+            const size_t substitute = above[col] + (lower(a[r]) == lower(b[col]) ? 0 : 1);
+            row[col + 1] = std::min(substitute, std::min(above[col + 1], row[col]) + 1);
+        }
+        above.swap(row);
+    }
+    return above.back();
+}
+
+inline std::string collapse_spaces(const std::string& s) {
+    // Despite the name, this removes every ' ' (0x20) from `s` rather than merging runs;
+    // tabs, newlines and other whitespace are preserved.
+    std::string stripped(s);
+    const auto tail = std::remove(stripped.begin(), stripped.end(), ' ');
+    stripped.erase(tail, stripped.end());
+    return stripped;
+}
+
+// Replaces words, or runs of up to three adjacent words, in `text` that lie within a small
+// case-insensitive edit distance (at most max(1, shorter length / 3)) of a custom-vocabulary
+// entry with that entry's spelling. Spaces are ignored when comparing, and a trailing "'s",
+// "'t" or plural "s" is set aside before matching and re-attached afterwards. A single word
+// that already equals an entry (ignoring case) is left untouched. `text` is rewritten in place.
+inline void apply_vocabulary_spelling_correction(
+    std::string& text,
+    const std::vector<std::string>& custom_vocabulary)
+{
+    if (custom_vocabulary.empty() || text.empty()) return;
+
+    struct VocabEntry {
+        const std::string* original;  // points into custom_vocabulary
+        std::string collapsed;        // the entry with every ' ' removed
+    };
+    std::vector<VocabEntry> vocab_entries;
+    vocab_entries.reserve(custom_vocabulary.size());
+    for (const auto& v : custom_vocabulary) {
+        // Entries that are empty once spaces are removed cannot be a meaningful correction, but
+        // they would match one-letter stems (e.g. "a 's") and erase words, so leave them out.
+        if (std::string collapsed = collapse_spaces(v); !collapsed.empty()) {
+            vocab_entries.push_back({&v, std::move(collapsed)});
+        }
+    }
+
+    // Split into alternating word / non-word tokens; ' and - count as word characters.
+    struct Token { std::string text; bool is_word; };
+    const auto is_word_char = [](char c) {
+        return std::isalnum(static_cast<unsigned char>(c)) || c == '\'' || c == '-';
+    };
+    std::vector<Token> tokens;
+    size_t pos = 0;
+    while (pos < text.size()) {
+        const size_t start = pos;
+        const bool is_word = is_word_char(text[pos]);
+        while (pos < text.size() && is_word_char(text[pos]) == is_word) ++pos;
+        tokens.push_back({text.substr(start, pos - start), is_word});
+    }
+
+    std::vector<size_t> word_indices;
+    for (size_t i = 0; i < tokens.size(); ++i) {
+        if (tokens[i].is_word) word_indices.push_back(i);
+    }
+
+    // Tokens swallowed by a multi-word replacement are flagged here and skipped on output.
+    std::vector<bool> consumed(tokens.size(), false);
+
+    // Splits a trailing "'s", "'t" or plural "s" off `word` and returns {stem, suffix}.
+    auto strip_suffix = [](const std::string& word) -> std::pair<std::string, std::string> {
+        if (word.size() >= 3 && word.substr(word.size() - 2) == "'s") {
+            return {word.substr(0, word.size() - 2), "'s"};
+        }
+        if (word.size() >= 3 && word.substr(word.size() - 2) == "'t") {
+            return {word.substr(0, word.size() - 2), "'t"};
+        }
+        if (word.size() >= 4 && word.back() == 's' &&
+            word[word.size() - 2] != 's' && // avoid stripping from "boss", "class"
+            std::isalpha(static_cast<unsigned char>(word[word.size() - 2]))) {
+            return {word.substr(0, word.size() - 1), "s"};
+        }
+        return {word, ""};
+    };
+
+    size_t wi = 0;
+    while (wi < word_indices.size()) {
+        size_t best_dist = std::numeric_limits<size_t>::max();
+        const std::string* best_match = nullptr;
+        size_t best_window = 0;
+        size_t best_first_token = 0;
+        size_t best_last_token = 0;
+        std::string best_suffix;
+
+        for (size_t window = std::min<size_t>(3, word_indices.size() - wi); window >= 1; --window) {
+            std::string window_collapsed;
+            const size_t first_tok = word_indices[wi];
+            const size_t last_tok = word_indices[wi + window - 1];
+            for (size_t w = 0; w < window; ++w) {
+                window_collapsed += tokens[word_indices[wi + w]].text;
+            }
+
+            if (window == 1 && window_collapsed.size() < 3) break;
+
+            auto [stem, suffix] = strip_suffix(window_collapsed);
+            const std::string* candidates[] = {&window_collapsed, &stem};
+            const std::string suffixes[] = {"", suffix};
+            const size_t num_candidates = suffix.empty() ? 1 : 2;
+
+            for (size_t ci = 0; ci < num_candidates; ++ci) {
+                const std::string& candidate = *candidates[ci];
+                if (candidate.empty()) continue;
+
+                for (const auto& entry : vocab_entries) {
+                    const size_t wlen = candidate.size();
+                    const size_t vlen = entry.collapsed.size();
+
+                    const size_t len_diff = wlen > vlen ? wlen - vlen : vlen - wlen;
+                    const size_t max_dist = std::max<size_t>(1, std::min(wlen, vlen) / 3);
+                    if (len_diff > max_dist) continue;
+
+                    const size_t dist = levenshtein_ci(candidate, entry.collapsed);
+
+                    // For single-edit corrections, require first char match to prevent
+                    // false positives like "vortex" → "Cortex".
+                    if (dist == 1 && window == 1) {
+                        const bool first_char_match =
+                            std::tolower(static_cast<unsigned char>(candidate[0])) ==
+                            std::tolower(static_cast<unsigned char>(entry.collapsed[0]));
+                        if (!first_char_match) continue;
+                    }
+
+                    if (dist <= max_dist && dist < best_dist) {
+                        best_dist = dist;
+                        best_match = entry.original;
+                        best_window = window;
+                        best_first_token = first_tok;
+                        best_last_token = last_tok;
+                        best_suffix = suffixes[ci];
+                    }
+                }
+            }
+
+            if (best_dist == 0) break;
+        }
+
+        // Allow dist==0 for multi-word merges where word boundaries changed.
+        const bool should_replace = best_match &&
+            best_dist != std::numeric_limits<size_t>::max() &&
+            (best_dist > 0 || best_window > 1);
+
+        if (should_replace) {
+            tokens[best_first_token].text = *best_match + best_suffix;
+            for (size_t t = best_first_token + 1; t <= best_last_token; ++t) {
+                consumed[t] = true;  // the separators between the merged words are dropped too
+            }
+            wi += best_window;
+        } else {
+            ++wi;
+        }
+    }
+
+    std::string result;
+    result.reserve(text.size());
+    for (size_t i = 0; i < tokens.size(); ++i) {
+        if (!consumed[i]) {
+            result += tokens[i].text;
+        }
+    }
+
+    text = std::move(result);
+}
+
+inline InferenceOptions parse_inference_options_json(const std::string& json) {
+    InferenceOptions options;
+
+    if (json.empty()) return options;
+
     size_t pos = json.find("\"temperature\"");
     if (pos != std::string::npos) {
         pos = json.find(':', pos) + 1;
-        temperature = std::stof(json.substr(pos));
+        options.temperature = std::stof(json.substr(pos));
     }
 
     pos = json.find("\"top_p\"");
     if (pos != std::string::npos) {
         pos = json.find(':', pos) + 1;
-        top_p = std::stof(json.substr(pos));
+        options.top_p = std::stof(json.substr(pos));
     }
 
     pos = json.find("\"top_k\"");
     if (pos != std::string::npos) {
         pos = json.find(':', pos) + 1;
-        top_k = std::stoul(json.substr(pos));
+        options.top_k = std::stoul(json.substr(pos));
     }
 
     pos = json.find("\"max_tokens\"");
     if (pos != std::string::npos) {
         pos = json.find(':', pos) + 1;
-        max_tokens = std::stoul(json.substr(pos));
+        options.max_tokens = std::stoul(json.substr(pos));
     }
 
     pos = json.find("\"force_tools\"");
     if (pos != std::string::npos) {
         pos = json.find(':', pos) + 1;
-        while (pos < json.length() && std::isspace(json[pos])) pos++;
-        force_tools = (json.substr(pos, 4) == "true");
+        while (pos < json.length() && std::isspace(static_cast<unsigned char>(json[pos]))) pos++;
+        options.force_tools = (json.substr(pos, 4) == "true");
     }
 
     pos = json.find("\"tool_rag_top_k\"");
     if (pos != std::string::npos) {
         pos = json.find(':', pos) + 1;
-        tool_rag_top_k = std::stoul(json.substr(pos));
+        options.tool_rag_top_k = std::stoul(json.substr(pos));
     }
 
     pos = json.find("\"confidence_threshold\"");
     if (pos != std::string::npos) {
         pos = json.find(':', pos) + 1;
-        confidence_threshold = std::stof(json.substr(pos));
+        options.confidence_threshold = std::stof(json.substr(pos));
     }
 
     pos = json.find("\"include_stop_sequences\"");
     if (pos != std::string::npos) {
         pos = json.find(':', pos) + 1;
-        while (pos < json.length() && std::isspace(json[pos])) pos++;
-        include_stop_sequences = (json.substr(pos, 4) == "true");
+        while (pos < json.length() && std::isspace(static_cast<unsigned char>(json[pos]))) pos++;
+        options.include_stop_sequences = (json.substr(pos, 4) == "true");
     }
 
     pos = json.find("\"use_vad\"");
     if (pos != std::string::npos) {
         pos = json.find(':', pos) + 1;
-        while (pos < json.length() && std::isspace(json[pos])) pos++;
-        use_vad = (json.substr(pos, 4) == "true");
+        while (pos < json.length() && std::isspace(static_cast<unsigned char>(json[pos]))) pos++;
+        options.use_vad = (json.substr(pos, 4) == "true");
     }
 
     pos = json.find("\"telemetry_enabled\"");
     if (pos != std::string::npos) {
         pos = json.find(':', pos) + 1;
-        while (pos < json.length() && std::isspace(json[pos])) pos++;
-        telemetry_enabled = (json.substr(pos, 4) == "true");
+        while (pos < json.length() && std::isspace(static_cast<unsigned char>(json[pos]))) pos++;
+        options.telemetry_enabled = (json.substr(pos, 4) == "true");
     }
 
-    if (auto_handoff) {
-        pos = json.find("\"auto_handoff\"");
-        if (pos != std::string::npos) {
-            pos = json.find(':', pos) + 1;
-            while (pos < json.length() && std::isspace(json[pos])) pos++;
-            *auto_handoff = (json.substr(pos, 4) == "true");
-        }
+    pos = json.find("\"auto_handoff\"");
+    if (pos != std::string::npos) {
+        pos = json.find(':', pos) + 1;
+        while (pos < json.length() && std::isspace(static_cast<unsigned char>(json[pos]))) pos++;
+        options.auto_handoff = (json.substr(pos, 4) == "true");
     }
 
-    if (cloud_timeout_ms) {
-        pos = json.find("\"cloud_timeout_ms\"");
-        if (pos != std::string::npos) {
-            pos = json.find(':', pos) + 1;
-            *cloud_timeout_ms = std::stoul(json.substr(pos));
-        }
+    pos = json.find("\"cloud_timeout_ms\"");
+    if (pos != std::string::npos) {
+        pos = json.find(':', pos) + 1;
+        options.cloud_timeout_ms = std::stoul(json.substr(pos));
     }
 
-    if (handoff_with_images) {
-        pos = json.find("\"handoff_with_images\"");
-        if (pos != std::string::npos) {
-            pos = json.find(':', pos) + 1;
-            while (pos < json.length() && std::isspace(json[pos])) pos++;
-            *handoff_with_images = (json.substr(pos, 4) == "true");
-        }
+    pos = json.find("\"handoff_with_images\"");
+    if (pos != std::string::npos) {
+        pos = json.find(':', pos) + 1;
+        while (pos < json.length() && std::isspace(static_cast<unsigned char>(json[pos]))) pos++;
+        options.handoff_with_images = (json.substr(pos, 4) == "true");
+    }
+
+    pos = json.find("\"enable_thinking_if_supported\"");
+    if (pos != std::string::npos) {
+        pos = json.find(':', pos) + 1;
+        while (pos < json.length() && std::isspace(static_cast<unsigned char>(json[pos]))) pos++;
+        options.enable_thinking_if_supported = (json.substr(pos, 4) == "true");
     }
 
     pos = json.find("\"stop_sequences\"");
@@ -673,12 +1322,14 @@ inline void parse_options_json(const std::string& json,
                 size_t seq_start = seq_pos + 1;
                 size_t seq_end = json.find('"', seq_start);
                 if (seq_end != std::string::npos) {
-                    stop_sequences.push_back(json.substr(seq_start, seq_end - seq_start));
+                    options.stop_sequences.push_back(json.substr(seq_start, seq_end - seq_start));
                 }
                 seq_pos = json.find('"', seq_end + 1);
             }
         }
     }
+
+    return options;
 }
 
 static inline std::string trim_lfm2_slice(const std::string& value, size_t begin, size_t end) {
@@ -755,7 +1406,6 @@ inline void parse_function_calls_from_response(const std::string& response_text,
 
     gemma::parse_function_calls(regular_response, function_calls);
 
-    // Parse Qwen-style function calls: <tool_call>{"name": "...", "arguments": {...}}</tool_call>
     const std::string QWEN_TOOL_START = "<tool_call>";
     const std::string QWEN_TOOL_END = "</tool_call>";
     size_t qwen_start_pos = 0;
@@ -764,27 +1414,62 @@ inline void parse_function_calls_from_response(const std::string& response_text,
         size_t content_start = qwen_start_pos + QWEN_TOOL_START.length();
         size_t qwen_end_pos = regular_response.find(QWEN_TOOL_END, content_start);
 
+        size_t erase_end;
+        std::string json_content;
+
         if (qwen_end_pos != std::string::npos) {
-            std::string json_content = regular_response.substr(content_start, qwen_end_pos - content_start);
+            json_content = regular_response.substr(content_start, qwen_end_pos - content_start);
+            erase_end = qwen_end_pos + QWEN_TOOL_END.length();
+        } else {
+            json_content = regular_response.substr(content_start);
+            erase_end = regular_response.length();
+        }
 
-            size_t first = json_content.find_first_not_of(" \t\n\r");
-            size_t last = json_content.find_last_not_of(" \t\n\r");
-            if (first != std::string::npos && last != std::string::npos) {
-                json_content = json_content.substr(first, last - first + 1);
-            }
+        size_t first = json_content.find_first_not_of(" \t\n\r");
+        size_t last = json_content.find_last_not_of(" \t\n\r");
+        if (first != std::string::npos && last != std::string::npos) {
+            json_content = json_content.substr(first, last - first + 1);
+        }
 
-            if (json_content.size() > 2 && json_content[0] == '{' &&
-                json_content.find("\"name\"") != std::string::npos) {
-                function_calls.push_back(json_content);
+        if (json_content.size() > 2 && json_content[0] == '{' &&
+            json_content.find("\"name\"") != std::string::npos) {
+            size_t depth = 0;
+            bool in_string = false;
+            bool escaped = false;
+            size_t end_pos = 0;
+            for (size_t c = 0; c < json_content.size(); c++) {
+                char ch = json_content[c];
+                if (escaped) {
+                    escaped = false;
+                    continue;
+                }
+                if (ch == '\\' && in_string) {
+                    escaped = true;
+                    continue;
+                }
+                if (ch == '"') {
+                    in_string = !in_string;
+                    continue;
+                }
+                if (!in_string) {
+                    if (ch == '{') depth++;
+                    else if (ch == '}') {
+                        depth--;
+                        if (depth == 0) {
+                            end_pos = c + 1;
+                            break;
+                        }
+                    }
+                }
+            }
+            if (end_pos > 0) {
+                function_calls.push_back(json_content.substr(0, end_pos));
             }
-
-            regular_response.erase(qwen_start_pos, qwen_end_pos + QWEN_TOOL_END.length() - qwen_start_pos);
-        } else {
-            break;
         }
+
+        regular_response.erase(qwen_start_pos, erase_end - qwen_start_pos);
     }
-    
-    // Parse LFM2-style function calls: <|tool_call_start|>[name(args)]<|tool_call_end|>
+
     const std::string TOOL_CALL_START = "<|tool_call_start|>";
     const std::string TOOL_CALL_END = "<|tool_call_end|>";
     size_t tool_start_pos = 0;
@@ -898,6 +1583,95 @@ inline void parse_function_calls_from_response(const std::string& response_text,
     }
 }
 
+inline std::vector<std::pair<size_t, size_t>> find_channel_token_ranges(
+    const std::vector<uint32_t>& tokens, size_t offset,
+    uint32_t channel_open_id, uint32_t channel_close_id) {
+    std::vector<std::pair<size_t, size_t>> ranges;
+    size_t pos = 0;
+    while (pos < tokens.size()) {
+        if (tokens[pos] != channel_open_id) {
+            pos++;
+            continue;
+        }
+
+        size_t block_start = pos;
+        pos++;
+        while (pos < tokens.size() && tokens[pos] != channel_close_id) {
+            pos++;
+        }
+        if (pos < tokens.size()) {
+            pos++;
+        }
+        ranges.push_back({offset + block_start, pos - block_start});
+    }
+    return ranges;
+}
+
+inline void strip_tag_blocks(std::string& text, std::string& extracted,
+                             const std::string& open_tag, const std::string& close_tag) {
+    std::string result;
+    size_t pos = 0;
+
+    size_t first_close = text.find(close_tag);
+    size_t first_open = text.find(open_tag);
+    if (first_close != std::string::npos &&
+        (first_open == std::string::npos || first_close < first_open)) {
+        extracted += text.substr(0, first_close);
+        pos = first_close + close_tag.size();
+    }
+
+    while (pos < text.size()) {
+        size_t open_pos = text.find(open_tag, pos);
+        if (open_pos == std::string::npos) {
+            result += text.substr(pos);
+            break;
+        }
+        result += text.substr(pos, open_pos - pos);
+        size_t content_start = open_pos + open_tag.size();
+        size_t close_pos = text.find(close_tag, content_start);
+        if (close_pos == std::string::npos) {
+            if (!extracted.empty()) extracted += "\n";
+            extracted += text.substr(content_start);
+            break;
+        }
+        if (!extracted.empty()) extracted += "\n";
+        extracted += text.substr(content_start, close_pos - content_start);
+        pos = close_pos + close_tag.size();
+    }
+    text = result;
+}
+
+inline void strip_thinking_block(const std::string& input, std::string& thinking, std::string& content) {
+    thinking.clear();
+    content = input;
+
+    auto trim = [](std::string& s) {
+        size_t first = s.find_first_not_of(" \t\n\r");
+        size_t last = s.find_last_not_of(" \t\n\r");
+        if (first != std::string::npos && last != std::string::npos)
+            s = s.substr(first, last - first + 1);
+        else
+            s.clear();
+    };
+
+    if (content.find("<|channel>") != std::string::npos || content.find("<channel|>") != std::string::npos) {
+        strip_tag_blocks(content, thinking, "<|channel>", "<channel|>");
+    } else if (content.find("<think>") != std::string::npos || content.find("</think>") != std::string::npos) {
+        strip_tag_blocks(content, thinking, "<think>", "</think>");
+    } else {
+        return;
+    }
+
+    trim(thinking);
+    trim(content);
+}
+
+struct TranscriptSegment {
+    float start;
+    float end;
+    std::string text;
+};
+
 inline std::string construct_response_json(const std::string& regular_response,
                                            const std::vector<std::string>& function_calls,
                                            double time_to_first_token,
@@ -907,19 +1681,32 @@ inline std::string construct_response_json(const std::string& regular_response,
                                            size_t prompt_tokens,
                                            size_t completion_tokens,
                                            float confidence = 0.0f,
-                                           bool cloud_handoff = false) {
+                                           bool cloud_handoff = false,
+                                           const std::string& thinking = "",
+                                           const std::vector<TranscriptSegment>& segments = {}) {
     std::ostringstream json;
     json << "{";
     json << "\"success\":true,";
     json << "\"error\":null,";
     json << "\"cloud_handoff\":" << (cloud_handoff ? "true" : "false") << ",";
     json << "\"response\":\"" << escape_json_string(regular_response) << "\",";
+    if (!thinking.empty()) {
+        json << "\"thinking\":\"" << escape_json_string(thinking) << "\",";
+    }
     json << "\"function_calls\":[";
     for (size_t i = 0; i < function_calls.size(); ++i) {
         if (i > 0) json << ",";
         json << function_calls[i];
     }
     json << "],";
+    json << "\"segments\":[";
+    for (size_t i = 0; i < segments.size(); ++i) {
+        if (i > 0) json << ",";
+        json << "{\"start\":" << std::fixed << std::setprecision(3) << segments[i].start
+             << ",\"end\":" << std::fixed << std::setprecision(3) << segments[i].end
+             << ",\"text\":\"" << escape_json_string(segments[i].text) << "\"}";
+    }
+    json << "],";
     json << "\"confidence\":" << std::fixed << std::setprecision(4) << confidence << ",";
     json << "\"time_to_first_token_ms\":" << std::fixed << std::setprecision(2) << time_to_first_token << ",";
     json << "\"total_time_ms\":" << std::fixed << std::setprecision(2) << total_time_ms << ",";
@@ -945,6 +1732,50 @@ inline std::string serialize_function_calls(const std::vector<std::string>& call
     return oss.str();
 }
 
+inline int validate_audio_params(
+    const char* component,
+    void* model,
+    char* response_buffer, size_t buffer_size,
+    const char* audio_file_path,
+    const uint8_t* pcm_buffer, size_t pcm_buffer_size) {
+    if (!model) {
+        std::string err = last_error_message.empty() ? "Model not initialized." : last_error_message;
+        CACTUS_LOG_ERROR(component, err);
+        handle_error_response(err, response_buffer, buffer_size);
+        return -1;
+    }
+    if (!response_buffer || buffer_size == 0) {
+        CACTUS_LOG_ERROR(component, "Invalid parameters: response_buffer or buffer_size");
+        handle_error_response("Invalid parameters", response_buffer, buffer_size);
+        return -1;
+    }
+    if (!audio_file_path && (!pcm_buffer || pcm_buffer_size == 0)) {
+        CACTUS_LOG_ERROR(component, "No audio input provided");
+        handle_error_response("Either audio_file_path or pcm_buffer must be provided", response_buffer, buffer_size);
+        return -1;
+    }
+    if (audio_file_path && pcm_buffer && pcm_buffer_size > 0) {
+        CACTUS_LOG_ERROR(component, "Both audio_file_path and pcm_buffer provided");
+        handle_error_response("Cannot provide both audio_file_path and pcm_buffer", response_buffer, buffer_size);
+        return -1;
+    }
+    if (pcm_buffer && pcm_buffer_size > 0 && (pcm_buffer_size < 2 || pcm_buffer_size % 2 != 0)) {
+        CACTUS_LOG_ERROR(component, "Invalid pcm_buffer_size");
+        handle_error_response("pcm_buffer_size must be even and at least 2 bytes", response_buffer, buffer_size);
+        return -1;
+    }
+    return 0;
+}
+
+inline std::vector<float> pcm_to_float(const uint8_t* pcm_buffer, size_t pcm_buffer_size) {
+    const int16_t* samples = reinterpret_cast<const int16_t*>(pcm_buffer);
+    size_t n = pcm_buffer_size / 2;
+    std::vector<float> out(n);
+    for (size_t i = 0; i < n; ++i)
+        out[i] = static_cast<float>(samples[i]) / 32768.0f;
+    return out;
+}
+
 } // namespace ffi
 } // namespace cactus
 
@@ -958,4 +1789,4 @@ const char* cactus_get_last_error();
 }
 #endif
 
-#endif // CACTUS_UTILS_H
\ No newline at end of file
+#endif // CACTUS_UTILS_H
diff --git a/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/engine.h b/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/engine.h
index c8bf34a..f686fd7 100644
--- a/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/engine.h
+++ b/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/engine.h
@@ -75,6 +75,7 @@ struct Config {
     bool use_pixel_shuffle = false;
     uint32_t pixel_shuffle_factor = 1;
     bool use_image_tokens = false;
+    uint32_t image_token_id = 0;
     bool use_layout_tags = false;
     uint32_t image_seq_len = 64;
 
@@ -107,6 +108,26 @@ struct Config {
     uint32_t subsampling_factor = 0;
     uint32_t num_mel_bins = 80;
     std::string encoder_hidden_act = "silu";
+    uint32_t linear_num_key_heads = 0;
+    uint32_t linear_key_head_dim = 0;
+    uint32_t linear_num_value_heads = 0;
+    uint32_t linear_value_head_dim = 0;
+    uint32_t linear_q_proj_dim = 0;
+    uint32_t linear_k_proj_dim = 0;
+    uint32_t linear_v_proj_dim = 0;
+
+    uint32_t kv_lora_rank = 0;
+    uint32_t q_lora_rank = 0;
+    uint32_t qk_head_dim = 0;
+    uint32_t qk_nope_head_dim = 0;
+    uint32_t qk_rope_head_dim = 0;
+    uint32_t v_head_dim = 0;
+    uint32_t rope_interleave = 0;
+    bool attention_bias = false;
+    float rope_scaling_factor = 1.0f;
+    float rope_mscale_all_dim = 0.0f;
+
+    enum class ModelType {QWEN = 0, GEMMA = 1, NOMIC = 3, LFM2 = 5, SIGLIP2 = 6, WHISPER = 7, MOONSHINE = 8, SILERO_VAD = 9, PARAKEET = 10, QWEN3P5 = 11, PARAKEET_TDT = 12, GEMMA3N = 13, YOUTU = 14, GEMMA4 = 15, PYANNOTE = 16, WESPEAKER = 17};
     uint32_t predictor_hidden_dim = 0;
     uint32_t predictor_num_layers = 0;
     uint32_t tdt_joint_dim = 0;
@@ -114,7 +135,6 @@ struct Config {
     uint32_t tdt_blank_id = 0;
     std::vector<uint32_t> tdt_durations;
 
-    enum class ModelType {QWEN = 0, GEMMA = 1, NOMIC = 3, LFM2 = 5, SIGLIP2 = 6, WHISPER = 7, MOONSHINE = 8, SILERO_VAD = 9, PARAKEET = 10, PARAKEET_TDT = 11};
     ModelType model_type = ModelType::QWEN;
 
     enum class ModelVariant {DEFAULT = 0, VLM = 1, EXTRACT = 2, RAG = 3};
@@ -138,6 +158,58 @@ struct Config {
     std::vector<std::string> layer_types;
     size_t conv_L_cache = 0;
 
+    uint32_t altup_num_inputs = 4;
+    uint32_t laurel_rank = 64;
+    uint32_t hidden_size_per_layer_input = 256;
+    uint32_t num_kv_shared_layers = 0;
+    uint32_t sliding_window = 512;
+    float rope_local_base_freq = 10000.0f;
+    float final_logit_softcapping = 0.0f;
+    float global_partial_rotary_factor = 1.0f;
+    uint32_t expert_intermediate_size = 0;
+    uint32_t global_head_dim = 0;
+    uint32_t num_global_kv_heads = 0;
+    bool attention_k_eq_v = false;
+    bool enable_moe_block = false;
+    std::vector<float> activation_sparsity_ppf;
+
+    uint32_t vision_head_dim = 64;
+    uint32_t vision_kv_heads = 12;
+    uint32_t vision_intermediate_size = 3072;
+    uint32_t vision_position_embedding_size = 10240;
+    uint32_t vision_pooling_kernel_size = 3;
+    uint32_t vision_default_output_length = 280;
+    float vision_rope_theta = 100.0f;
+
+    uint32_t audio_hidden_dim = 0;
+    uint32_t audio_num_layers = 0;
+    uint32_t audio_num_heads = 0;
+    uint32_t audio_head_dim = 0;
+    uint32_t audio_input_feat_size = 128;
+    uint32_t audio_conf_conv_kernel_size = 5;
+    uint32_t audio_chunk_size = 12;
+    uint32_t audio_context_left = 13;
+    uint32_t audio_context_right = 0;
+    float audio_logit_cap = 50.0f;
+    float audio_residual_weight = 0.5f;
+    uint32_t audio_output_proj_dims = 0;
+    uint32_t audio_vocab_size = 128;
+    uint32_t audio_vocab_offset = 0;
+    uint32_t audio_soft_tokens = 188;
+    uint32_t audio_sscp_conv0_channels = 128;
+    uint32_t audio_sscp_conv1_channels = 32;
+    float audio_sscp_conv_eps = 1e-3f;
+    float audio_rms_norm_eps = 1e-6f;
+    uint32_t audio_fft_length = 1024;
+    uint32_t audio_token_id = 0;
+    bool audio_fft_overdrive = false;
+    uint32_t channel_open_token_id = 100;
+    uint32_t channel_close_token_id = 101;
+
+    static bool is_gemma_family(ModelType t) {
+        return t == ModelType::GEMMA || t == ModelType::GEMMA3N || t == ModelType::GEMMA4;
+    }
+
     bool from_json(const std::string& json_path);
     std::string to_json() const;
 };
@@ -155,14 +227,38 @@ struct MergeRule {
 };
 
 
+struct ToolCallInfo {
+    std::string name;
+    std::string arguments;
+};
+
 struct ChatMessage {
     std::string role;
     std::string content;
     std::string name;
     std::vector<std::string> images;
+    std::vector<std::string> audio;
+    size_t audio_soft_token_count = 0;
+    std::vector<ToolCallInfo> tool_calls;
 };
 
+struct TokenizerRuntimeConfig {
+    enum class TokenizerType { UNKNOWN, BPE, SENTENCEPIECE };
+    enum class VocabFormat { UNKNOWN, ID_TAB_TOKEN, LINE_TOKEN };
+    enum class Normalizer { NONE, METASPACE, BYTE_LEVEL };
+    enum class Decoder { NONE, REPLACE_METASPACE, BYTE_LEVEL };
+
+    TokenizerType tokenizer_type = TokenizerType::UNKNOWN;
+    VocabFormat vocab_format = VocabFormat::UNKNOWN;
+    Normalizer normalizer = Normalizer::NONE;
+    Decoder decoder = Decoder::NONE;
+    bool byte_fallback = false;
+    bool has_chat_template = false;
+};
 
+TokenizerRuntimeConfig load_tokenizer_runtime_config(const std::string& config_file);
+void load_special_tokens_map(const std::string& config_file, std::unordered_map<std::string, uint32_t>& special_tokens);
+std::vector<std::string> split_with_special_tokens(const std::string& text, const std::unordered_map<std::string, uint32_t>& special_tokens);
 
 class Tokenizer {
 public:
@@ -172,7 +268,7 @@ class Tokenizer {
     virtual std::string decode(const std::vector<uint32_t>& tokens) const = 0;
 
     virtual std::vector<uint32_t> apply_chat_template(const std::vector<ChatMessage>& messages, bool add_generation_prompt = true) const;
-    virtual std::string format_chat_prompt(const std::vector<ChatMessage>& messages, bool add_generation_prompt = true, const std::string& tools_json = "") const;
+    virtual std::string format_chat_prompt(const std::vector<ChatMessage>& messages, bool add_generation_prompt = true, const std::string& tools_json = "", bool enable_thinking_if_supported = true) const;
 
     virtual uint32_t get_vocab_size() const = 0;
     virtual uint32_t get_unk_token() const = 0;
@@ -188,7 +284,7 @@ class Tokenizer {
     uint32_t get_global_img_token_id() const { return global_img_token_id_; }
 
 protected:
-    enum class ModelType { UNKNOWN, QWEN, GEMMA, LFM2, BERT, WHISPER, PARAKEET};
+    enum class ModelType { UNKNOWN, QWEN, QWEN3P5, GEMMA, GEMMA4, LFM2, BERT, WHISPER, PARAKEET, YOUTU};
     ModelType model_type_ = ModelType::UNKNOWN;
     enum class ModelVariant { DEFAULT, VLM, EXTRACT, RAG};
     ModelVariant model_variant_ = ModelVariant::DEFAULT;
@@ -199,11 +295,21 @@ class Tokenizer {
     uint32_t fake_token_id_ = 49189;
     uint32_t global_img_token_id_ = 49152;
 
+
+    uint32_t vision_patch_size_ = 16;
+    uint32_t vision_pooling_kernel_size_ = 3;
+    uint32_t vision_default_output_length_ = 280;
+    uint32_t vision_image_size_ = 768;
+    TokenizerRuntimeConfig runtime_config_;
+
     void detect_model_type(const std::string& config_path);
-    std::string format_qwen_style(const std::vector<ChatMessage>& messages, bool add_generation_prompt, const std::string& tools_json) const;
+    void load_chat_template(const std::string& template_file);
+    std::string format_qwen_style(const std::vector<ChatMessage>& messages, bool add_generation_prompt, const std::string& tools_json, bool enable_thinking_if_supported = true) const;
     std::string format_gemma_style(const std::vector<ChatMessage>& messages, bool add_generation_prompt, const std::string& tools_json) const;
+    std::string format_gemma4_style(const std::vector<ChatMessage>& messages, bool add_generation_prompt, const std::string& tools_json, bool enable_thinking_if_supported = true) const;
     std::string format_lfm2_style(const std::vector<ChatMessage>& messages, bool add_generation_prompt, const std::string& tools_json) const;
     std::string format_lfm2_vl_style(const std::vector<ChatMessage>& messages, bool add_generation_prompt, const std::string& tools_json) const;
+    std::string format_youtu_style(const std::vector<ChatMessage>& messages, bool add_generation_prompt, const std::string& tools_json) const;
 };
 
 class BPETokenizer : public Tokenizer {
@@ -245,6 +351,7 @@ class BPETokenizer : public Tokenizer {
     std::string bytes_to_unicode(const std::string& text) const;
     std::string unicode_to_bytes(const std::string& text) const;
     std::vector<std::string> byte_level_split(const std::string& text) const;
+    std::vector<std::string> utf8_split(const std::string& text) const;
 
     void cleanup_mmap();
     
@@ -256,12 +363,6 @@ class BPETokenizer : public Tokenizer {
     std::unordered_map<std::string, uint32_t> special_tokens_;
     std::vector<std::string> split_with_special_tokens(const std::string& text) const;
     void load_special_tokens(const std::string& config_file);
-
-    void load_chat_template(const std::string& template_file);
-
-    std::unordered_map<std::string, uint32_t> tool_tokens_;
-    bool has_tool_support_;
-    void load_tokenizer_config(const std::string& config_file);
 };
 
 class SPTokenizer : public Tokenizer {
@@ -311,8 +412,6 @@ class SPTokenizer : public Tokenizer {
     std::unordered_map<std::string, uint32_t> special_tokens_;
     std::vector<std::string> split_with_special_tokens(const std::string& text) const;
     void load_special_tokens(const std::string& config_file);
-
-    void load_chat_template(const std::string& template_file);
 };
 
 class ConvCache {
@@ -355,8 +454,10 @@ struct KVCache {
     struct LayerCache {
         std::vector<uint8_t> keys;
         std::vector<uint8_t> values;
-        std::vector<float> key_scales;   
-        std::vector<float> value_scales; 
+        std::vector<float> key_scales;
+        std::vector<float> value_scales;
+        size_t head_dim = 0;
+        size_t kv_heads = 0;
     };
 
     std::vector<LayerCache> layer_caches;
@@ -366,8 +467,6 @@ struct KVCache {
     size_t current_seq_len = 0;
     size_t total_seq_len = 0;
     size_t max_seq_len = 2048;
-    size_t num_kv_heads = 0;
-    size_t head_dim = 0;
     size_t num_layers = 0;
     Precision precision;
     size_t element_size = 4;
@@ -375,12 +474,14 @@ struct KVCache {
     void set_window_size(size_t window, size_t sink = DEFAULT_SINK_SIZE);
     size_t get_effective_seq_len() const { return current_seq_len; }
     size_t get_total_seq_len() const { return total_seq_len; }
+    size_t get_layer_head_dim(size_t layer_idx) const { return layer_caches[layer_idx].head_dim; }
+    size_t get_layer_kv_heads(size_t layer_idx) const { return layer_caches[layer_idx].kv_heads; }
 
-    void init(size_t num_layers, size_t max_seq, size_t num_kv_heads, size_t head_dim, Precision model_precision);
+    void init(size_t num_layers, size_t max_seq, const std::vector<size_t>& layer_dims, const std::vector<size_t>& layer_kv_heads, Precision model_precision);
     void reset();
     void update_from_graph(CactusGraph* gb, const std::vector<size_t>& k_nodes,
                           const std::vector<size_t>& v_nodes, size_t seq_len,
-                          size_t num_layers, size_t kv_heads, size_t head_dim);
+                          size_t num_layers);
 
     void update_from_npu(size_t layer_idx, const __fp16* k_data, const __fp16* v_data,
                          size_t num_tokens, size_t kv_heads, size_t head_dim);
@@ -404,6 +505,9 @@ struct KVCache {
     const int8_t* get_values_int8(size_t layer) const;
     const float* get_key_scales(size_t layer) const;
     const float* get_value_scales(size_t layer) const;
+
+    void remove_token_range(size_t start, size_t count);
+    void compact_to_windows(const std::vector<size_t>& target_windows);
 };
 
 class ToolCallConstrainer {
@@ -421,7 +525,7 @@ class ToolCallConstrainer {
         QWEN_EXPECT_ARGS_COLON, 
         QWEN_IN_ARGUMENTS,  
         QWEN_EXPECT_CLOSE_BRACE,
-        QWEN_EXPECT_END, 
+        QWEN_EXPECT_END,
 
         LFM_START,              
         LFM_EXPECT_BRACKET, 
@@ -457,12 +561,17 @@ class ToolCallConstrainer {
     Config::ModelType model_type_ = Config::ModelType::QWEN;
     Tokenizer* tokenizer_ = nullptr;
 
+    bool is_gemma_family() const { return Config::is_gemma_family(model_type_); }
+
     std::vector<std::string> function_names_;
     std::string generated_text_;
-    int brace_depth_ = 0;  
+    int brace_depth_ = 0;
+
+    std::string call_start_tag_;
+    std::string call_end_tag_;
 
-    std::unordered_set<uint32_t> qwen_tool_call_start_tokens_; 
-    std::unordered_set<uint32_t> qwen_tool_call_end_tokens_;   
+    std::unordered_set<uint32_t> qwen_tool_call_start_tokens_;
+    std::unordered_set<uint32_t> qwen_tool_call_end_tokens_;
     std::unordered_set<uint32_t> open_brace_tokens_;         
     std::unordered_set<uint32_t> close_brace_tokens_;       
     std::unordered_set<uint32_t> colon_tokens_;            
@@ -472,7 +581,7 @@ class ToolCallConstrainer {
     std::unordered_set<uint32_t> quote_tokens_;            
     std::unordered_set<uint32_t> backtick_tokens_;   
     std::unordered_set<uint32_t> all_func_name_tokens_;
-    std::unordered_map<std::string, std::vector<uint32_t>> func_name_sequences_;  
+    std::unordered_map<std::string, std::vector<uint32_t>> func_name_sequences_;
 
     std::unordered_set<uint32_t> tool_start_tokens_;
     std::unordered_set<uint32_t> tool_end_tokens_;
@@ -523,12 +632,16 @@ class Model {
 
     virtual void prefill(const std::vector<uint32_t>& tokens, size_t chunk_size = 256, const std::string& profile_file = "");
 
+    virtual void prefill_with_images(const std::vector<uint32_t>& tokens, const std::vector<std::string>& image_paths,
+                                     const std::string& profile_file = "");
+
     virtual uint32_t decode_with_images(const std::vector<uint32_t>& tokens, const std::vector<std::string>& image_paths,
                                           float temperature = -1.0f, float top_p = -1.0f,
                                           size_t top_k = 0, const std::string& profile_file = "", float* out_entropy = nullptr);
 
     virtual uint32_t decode_with_audio(const std::vector<uint32_t>& tokens, const std::vector<float>& audio_features, float temperature = 0.0f, float top_p = 0.0f,
-                      size_t top_k = 0, const std::string& profile_file = "", float* out_entropy = nullptr);
+                      size_t top_k = 0, const std::string& profile_file = "", float* out_entropy = nullptr,
+                      float* out_token_time_start = nullptr, float* out_token_time_end = nullptr);
 
     std::vector<float> get_embeddings(const std::vector<uint32_t>& tokens, bool pooled = true, bool normalize = false, const std::string& profile_file = "");
     
@@ -548,13 +661,37 @@ class Model {
     bool has_npu_prefill() const;
     size_t get_prefill_chunk_size() const;
 
+    virtual void remove_thinking_tokens(const std::vector<std::pair<size_t, size_t>>& ranges);
+    virtual void compact_kv_cache() {}
+
     void set_tool_constraints(const std::vector<std::string>& function_names);
     void clear_tool_constraints();
     void update_tool_constraints(uint32_t token_id);
 
     void* graph_handle_;
 
+    void set_vocab_bias(const std::unordered_map<uint32_t, float>& bias) {
+        vocab_bias_ = bias;
+    }
+
+    void clear_vocab_bias() {
+        vocab_bias_.clear();
+    }
+
+    bool has_vocab_bias() const {
+        return !vocab_bias_.empty();
+    }
+
+    const std::unordered_map<uint32_t, float>& get_vocab_bias() const {
+        return vocab_bias_;
+    }
+
 protected:
+    size_t sample_token(CactusGraph* gb, size_t logits_node_id, float temperature, float top_p, size_t top_k,
+                        const std::unordered_map<uint32_t, float>* extra_bias = nullptr) const;
+
+    static void compute_entropy(CactusGraph* gb, size_t logits_node_id, float* out_entropy);
+
     virtual size_t forward(const std::vector<uint32_t>& tokens, bool use_cache = false) = 0;
     
     virtual size_t forward(const std::vector<float>& audio_features, const std::vector<uint32_t>& tokens, bool use_cache = false);
@@ -569,6 +706,12 @@ class Model {
     virtual size_t build_transformer_block(CactusGraph* gb, size_t hidden, uint32_t layer_idx,
                                   ComputeBackend backend, bool use_cache = false, size_t position_offset = 0) = 0;
     void update_kv_cache(CactusGraph* gb, size_t seq_len);
+    virtual std::vector<size_t> get_kv_layer_dims() const {
+        return std::vector<size_t>(config_.num_layers, config_.attention_head_dim);
+    }
+    virtual std::vector<size_t> get_kv_layer_heads() const {
+        return std::vector<size_t>(config_.num_layers, config_.attention_kv_heads);
+    }
     virtual void post_init() {}
     virtual void post_execute_updates(CactusGraph*, size_t) {}
     Config config_;
@@ -601,6 +744,9 @@ class Model {
     virtual std::vector<__fp16> get_token_embeddings(const std::vector<uint32_t>& tokens);
 
     ToolCallConstrainer tool_constrainer_;
+
+private:
+    std::unordered_map<uint32_t, float> vocab_bias_;
 };
 
 std::unique_ptr<Model> create_model(const std::string& model_folder);
@@ -705,13 +851,17 @@ class AudioProcessor {
         bool remove_dc_offset = false;
         float preemphasis = 0.0f;
         bool hann_periodic = true;
+        float window_a0 = 0.5f;
+        size_t fft_override = 0;
+        bool mel_floor_additive = false;
     };
 
     AudioProcessor();
     ~AudioProcessor();
 
     void init_mel_filters(size_t num_frequency_bins, size_t num_mel_filters,
-                          float min_freq, float max_freq, size_t sampling_rate);
+                          float min_freq, float max_freq, size_t sampling_rate,
+                          const char* norm = "slaney", const char* mel_scale = "slaney");
 
     std::vector<float> compute_spectrogram(
         const std::vector<float>& waveform,
diff --git a/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/gemma_tools.h b/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/gemma_tools.h
index 912de57..f0f9fe2 100644
--- a/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/gemma_tools.h
+++ b/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/gemma_tools.h
@@ -53,6 +53,7 @@ inline std::string format_argument(const std::string& json, size_t& pos, bool es
     char c = json[pos];
 
     if (c == '"') {
+        pos++;
         std::string value = extract_json_string(json, pos);
         return escape(value);
     } else if (c == '{') {
@@ -240,7 +241,7 @@ inline std::string format_parameters(const std::string& properties_json, const s
                 result += ",properties:{" + format_parameters(prop_obj["properties"], nested_required) + "}";
             }
             if (prop_obj.count("required")) {
-                result += ",required:[";
+                std::string req_items;
                 size_t req_pos = 0;
                 skip_whitespace(prop_obj["required"], req_pos);
                 if (req_pos < prop_obj["required"].length() && prop_obj["required"][req_pos] == '[') {
@@ -253,13 +254,15 @@ inline std::string format_parameters(const std::string& properties_json, const s
                         if (prop_obj["required"][req_pos] == '"') {
                             req_pos++;
                             std::string req_item = extract_json_string(prop_obj["required"], req_pos);
-                            if (!req_first) result += ",";
+                            if (!req_first) req_items += ",";
                             req_first = false;
-                            result += escape(req_item);
+                            req_items += escape(req_item);
                         }
                     }
                 }
-                result += "]";
+                if (!req_items.empty()) {
+                    result += ",required:[" + req_items + "]";
+                }
             }
         } else if (to_upper(type_val) == "ARRAY") {
             if (prop_obj.count("items")) {
@@ -342,7 +345,7 @@ inline std::string format_function_declaration(const std::string& name,
         }
 
         if (params.count("required")) {
-            result += ",required:[";
+            std::string req_items;
             size_t req_pos = 0;
             skip_whitespace(params["required"], req_pos);
             if (req_pos < params["required"].length() && params["required"][req_pos] == '[') {
@@ -355,13 +358,15 @@ inline std::string format_function_declaration(const std::string& name,
                     if (params["required"][req_pos] == '"') {
                         req_pos++;
                         std::string item = extract_json_string(params["required"], req_pos);
-                        if (!first) result += ",";
+                        if (!first) req_items += ",";
                         first = false;
-                        result += escape(item);
+                        req_items += escape(item);
                     }
                 }
             }
-            result += "]";
+            if (!req_items.empty()) {
+                result += ",required:[" + req_items + "]";
+            }
         }
 
         if (params.count("type")) {
@@ -377,12 +382,15 @@ inline std::string format_function_declaration(const std::string& name,
 }
 
 template<typename ToolFunction>
-inline std::string format_tools(const std::vector<ToolFunction>& tools) {
+inline std::string format_tools(const std::vector<ToolFunction>& tools, bool use_pipe_tags = false) {
     if (tools.empty()) return "";
 
+    const char* decl_start = use_pipe_tags ? "<|tool>" : "<start_function_declaration>";
+    const char* decl_end   = use_pipe_tags ? "<tool|>" : "<end_function_declaration>";
+
     std::string result;
     for (const auto& tool : tools) {
-        result += "<start_function_declaration>";
+        result += decl_start;
         std::string params_json;
         auto it = tool.parameters.find("schema");
         if (it != tool.parameters.end()) {
@@ -390,12 +398,26 @@ inline std::string format_tools(const std::vector<ToolFunction>& tools) {
         }
 
         result += format_function_declaration(tool.name, tool.description, params_json);
-        result += "<end_function_declaration>";
+        result += decl_end;
     }
     return result;
 }
 
 
+inline size_t match_quote_tag(const std::string& s, size_t pos) {
+    if (s.compare(pos, 8, "<escape>") == 0) return 8;
+    if (s.compare(pos, 5, "<|\"|>") == 0) return 5;
+    return 0;
+}
+
+inline size_t find_quote_tag(const std::string& s, size_t pos) {
+    size_t e = s.find("<escape>", pos);
+    size_t t = s.find("<|\"|>", pos);
+    if (e == std::string::npos) return t;
+    if (t == std::string::npos) return e;
+    return std::min(e, t);
+}
+
 inline std::string unescape(const std::string& s) {
     const std::string ESCAPE_TAG = "<escape>";
     std::string result = s;
@@ -427,12 +449,13 @@ inline std::string args_to_json(const std::string& args_content) {
         while (pos < args_content.length() && std::isspace(args_content[pos])) pos++;
 
         if (pos < args_content.length()) {
-            if (args_content.compare(pos, 8, "<escape>") == 0) {
-                pos += 8; 
-                size_t val_end = args_content.find("<escape>", pos);
+            size_t qtag_len = match_quote_tag(args_content, pos);
+            if (qtag_len > 0) {
+                pos += qtag_len;
+                size_t val_end = find_quote_tag(args_content, pos);
                 if (val_end != std::string::npos) {
                     value = "\"" + args_content.substr(pos, val_end - pos) + "\"";
-                    pos = val_end + 8; 
+                    pos = val_end + match_quote_tag(args_content, val_end);
                 }
             } else if (args_content[pos] == '{') {
                 int depth = 1;
@@ -464,12 +487,13 @@ inline std::string args_to_json(const std::string& args_content) {
                     if (!first_item) value += ",";
                     first_item = false;
 
-                    if (arr_content.compare(arr_pos, 8, "<escape>") == 0) {
-                        arr_pos += 8;
-                        size_t end = arr_content.find("<escape>", arr_pos);
+                    size_t aq_len = match_quote_tag(arr_content, arr_pos);
+                    if (aq_len > 0) {
+                        arr_pos += aq_len;
+                        size_t end = find_quote_tag(arr_content, arr_pos);
                         if (end != std::string::npos) {
                             value += "\"" + arr_content.substr(arr_pos, end - arr_pos) + "\"";
-                            arr_pos = end + 8;
+                            arr_pos = end + match_quote_tag(arr_content, end);
                         }
                     } else {
                         size_t end = arr_content.find_first_of(",]", arr_pos);
@@ -499,8 +523,11 @@ inline std::string args_to_json(const std::string& args_content) {
 }
 
 inline void parse_function_calls(std::string& response, std::vector<std::string>& function_calls) {
-    const std::string CALL_START = "<start_function_call>";
-    const std::string CALL_END = "<end_function_call>";
+
+    const std::string CALL_START = (response.find("<|tool_call>") != std::string::npos)
+        ? "<|tool_call>" : "<start_function_call>";
+    const std::string CALL_END = (CALL_START == "<|tool_call>")
+        ? "<tool_call|>" : "<end_function_call>";
     size_t pos = 0;
 
     while ((pos = response.find(CALL_START, pos)) != std::string::npos) {
diff --git a/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/graph.h b/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/graph.h
index 01b7b2f..020412a 100644
--- a/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/graph.h
+++ b/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/graph.h
@@ -122,13 +122,14 @@ enum class Activation {
 enum class OpType {
     INPUT, PRECISION_CAST,
     ADD, ADD_CLIPPED, SUBTRACT, MULTIPLY, DIVIDE,
+    ABS, POW, FLATTEN, VIEW,
     MATMUL, TRANSPOSE, RESHAPE, SLICE, GATHER, EMBEDDING,
     BILINEAR_INTERPOLATION,
     SUM, MEAN, VARIANCE, MIN, MAX,
     RMS_NORM, ROPE, ROPE_GPTJ, SOFTMAX, ATTENTION, ATTENTION_INT8_HYBRID, REL_POS_BIAS, CONV1D_CAUSAL, CONV1D_K3, CONV1D_K7S3, CONV1D, CONV1D_SAME_DEPTHWISE_K9, CONV1D_POINTWISE, CONV2D_K3S2P1, CONV2D_DEPTHWISE_K3S2P1, CONV2D_POINTWISE_1X1, GLU, BATCHNORM,
     SCALAR_ADD, SCALAR_SUBTRACT, SCALAR_MULTIPLY, SCALAR_DIVIDE, SCALAR_EXP, SCALAR_SQRT, SCALAR_COS, SCALAR_SIN, SCALAR_LOG,
     RELU, SILU, GELU, GELU_ERF, SIGMOID, TANH,
-    SAMPLE, CONCAT,
+    SAMPLE, CONCAT, CAT,
     SCATTER_TOPK,
     TOPK, LAYERNORM, GROUPNORM,
     MOE_LAYER,
@@ -136,7 +137,17 @@ enum class OpType {
     PERSISTENT,
     QUANTIZE_ACTIVATIONS,
     LSTM_CELL,
-    STFT
+    GATED_DELTANET_DECODE,
+    GATED_DELTANET_PREFILL,
+    STFT,
+    ALTUP_PREDICT,
+    ALTUP_CORRECT,
+    GAUSSIAN_TOPK,
+    MAXPOOL1D,
+    BILSTM_SEQUENCE,
+    LEAKY_RELU,
+    CONV2D_K3S1P1,
+    STATS_POOL
 };
 
 struct PrecisionTraits {
@@ -315,6 +326,7 @@ struct OpParams {
     size_t window_size = 0;
     bool is_causal = true;  
     bool attention_mask_is_additive = false;
+    float logit_cap = 0.0f;
     std::vector<size_t> new_shape;
     std::vector<size_t> permutation;
     Precision output_precision = Precision::INT8;
@@ -350,6 +362,10 @@ struct OpParams {
     size_t num_kv_heads = 0;
     size_t head_dim = 0;
     size_t num_fft_bins = 0;
+    size_t chunk_size = 0;
+    size_t num_altup_inputs = 0;
+    size_t v_head_dim = 0;
+    size_t kernel_size = 0;
 };
 
 struct GraphNode {
@@ -362,6 +378,28 @@ struct GraphNode {
     GraphNode(size_t node_id, OpType type);
 };
 
+using nodes_vector = std::vector<std::unique_ptr<GraphNode>>;
+using node_index_map_t = std::unordered_map<size_t, size_t>;
+
+inline const BufferDesc& get_input(const GraphNode& node, size_t idx,  // Returns the output buffer of the node that feeds input slot idx of node.
+                                   const nodes_vector& nodes,
+                                   const node_index_map_t& node_index_map) {
+    return nodes[node_index_map.at(node.input_ids[idx])]->output_buffer;  // input id -> position in nodes via the map; .at() throws std::out_of_range for an unmapped id, idx has no explicit check here
+}
+
+struct AxisDims {  // Factors a shape around one axis into (outer, axis_size, inner) extents.
+    size_t outer, axis_size, inner;  // product of dims before the axis, the dim at the axis, product of dims after it
+    static AxisDims from_shape(const std::vector<size_t>& shape, size_t axis) {  // no bounds check: axis must be < shape.size()
+        AxisDims d;
+        d.outer = 1;  // stays 1 (empty product) when axis == 0
+        for (size_t i = 0; i < axis; i++) d.outer *= shape[i];
+        d.axis_size = shape[axis];
+        d.inner = 1;  // stays 1 (empty product) when axis is the last dimension
+        for (size_t i = axis + 1; i < shape.size(); i++) d.inner *= shape[i];
+        return d;
+    }
+};
+
 template<typename T>
 void dispatch_binary_op(OpType op, const T* lhs, const T* rhs, T* output, size_t count);
 
@@ -383,6 +421,14 @@ void compute_groupnorm_node(GraphNode& node, const std::vector<std::unique_ptr<G
 void compute_persistent_node(GraphNode& node, const std::vector<std::unique_ptr<GraphNode>>& nodes, const std::unordered_map<size_t, size_t>& node_index_map);
 void compute_index_node(GraphNode& node, const std::vector<std::unique_ptr<GraphNode>>& nodes, const std::unordered_map<size_t, size_t>& node_index_map);
 void compute_lstm_cell_node(GraphNode& node, const std::vector<std::unique_ptr<GraphNode>>& nodes, const std::unordered_map<size_t, size_t>& node_index_map);
+void compute_gated_deltanet_decode_node(GraphNode& node, const std::vector<std::unique_ptr<GraphNode>>& nodes, const std::unordered_map<size_t, size_t>& node_index_map);
+void compute_gated_deltanet_prefill_node(GraphNode& node, const std::vector<std::unique_ptr<GraphNode>>& nodes, const std::unordered_map<size_t, size_t>& node_index_map);
+void compute_altup_predict_node(GraphNode& node, const std::vector<std::unique_ptr<GraphNode>>& nodes, const std::unordered_map<size_t, size_t>& node_index_map);
+void compute_altup_correct_node(GraphNode& node, const std::vector<std::unique_ptr<GraphNode>>& nodes, const std::unordered_map<size_t, size_t>& node_index_map);
+void compute_maxpool1d_node(GraphNode& node, const std::vector<std::unique_ptr<GraphNode>>& nodes, const std::unordered_map<size_t, size_t>& node_index_map);
+void compute_bilstm_sequence_node(GraphNode& node, const std::vector<std::unique_ptr<GraphNode>>& nodes, const std::unordered_map<size_t, size_t>& node_index_map);
+void compute_conv2d_k3s1p1_node(GraphNode& node, const std::vector<std::unique_ptr<GraphNode>>& nodes, const std::unordered_map<size_t, size_t>& node_index_map);
+void compute_stats_pool_node(GraphNode& node, const std::vector<std::unique_ptr<GraphNode>>& nodes, const std::unordered_map<size_t, size_t>& node_index_map);
 
 void shrink_thread_local_buffers();
 class BufferPool {
@@ -437,7 +483,6 @@ class CactusGraph {
     size_t multiply(size_t input1, size_t input2);
     size_t divide(size_t input1, size_t input2);
     
-    
     size_t scalar_add(size_t input, float value);
     size_t scalar_subtract(size_t input, float value);
     size_t scalar_multiply(size_t input, float value);
@@ -455,6 +500,11 @@ class CactusGraph {
     size_t sigmoid(size_t input);
     size_t tanh(size_t input);
     size_t glu(size_t input, int axis = -1);
+
+    size_t abs(size_t input);
+    size_t pow(size_t input, float exponent);
+    size_t view(size_t input, const std::vector<size_t>& new_shape);
+    size_t flatten(size_t input, int start_dim = 0, int end_dim = -1);
     
     size_t matmul(size_t input1, size_t input2, bool pretransposed_rhs = false, ComputeBackend backend = ComputeBackend::CPU);
     size_t transpose(size_t input, ComputeBackend backend = ComputeBackend::CPU);
@@ -497,7 +547,9 @@ class CactusGraph {
                      size_t num_experts_per_tok,
                      bool normalize_routing,
                      float epsilon,
-                     float routed_scaling_factor);
+                     float routed_scaling_factor,
+                     Activation activation = Activation::SILU,
+                     size_t per_expert_scale = 0);
     size_t moe_layer(size_t hidden,
                      size_t routing_probs,
                      size_t topk_indices,
@@ -518,13 +570,15 @@ class CactusGraph {
     size_t attention(size_t query, size_t key, size_t value, float scale, size_t position_offset, size_t window_size, ComputeBackend backend = ComputeBackend::CPU);
     size_t attention_masked(size_t query, size_t key, size_t value, size_t mask, float scale,
                             bool is_causal = true, ComputeBackend backend = ComputeBackend::CPU,
-                            bool additive_mask = false, size_t position_offset = 0, size_t window_size = 0);
+                            bool additive_mask = false, size_t position_offset = 0, size_t window_size = 0,
+                            float logit_cap = 0.0f);
     size_t rel_pos_bias(size_t query, size_t relative_key, float scale);
 
     size_t attention_int8_hybrid(size_t query, size_t key_new, size_t value_new, float scale, size_t position_offset,
                                  const int8_t* cached_keys, const int8_t* cached_values,
                                  const float* k_scales, const float* v_scales,
-                                 size_t cache_len, size_t num_kv_heads, size_t head_dim, size_t window_size = 0);
+                                 size_t cache_len, size_t num_kv_heads, size_t head_dim,
+                                 size_t window_size = 0, size_t v_head_dim = 0);
 
     size_t conv1d_causal(size_t input, size_t weight, size_t kernel_size, size_t dilation = 1);
     size_t conv1d_k3(size_t input, size_t weight, size_t stride);
@@ -543,12 +597,30 @@ class CactusGraph {
     size_t conv2d_pointwise_1x1(size_t input, size_t weight, size_t bias);
 
     size_t lstm_cell(size_t input, size_t h_prev, size_t c_prev, size_t weight_ih, size_t weight_hh, size_t bias_ih, size_t bias_hh);
+    size_t gated_deltanet_decode(size_t query, size_t key, size_t value, size_t gate_log, size_t beta,
+                                 size_t initial_state, float scale = 0.0f);
+    size_t gated_deltanet_prefill(size_t query, size_t key, size_t value, size_t gate_log, size_t beta,
+                                  size_t initial_state, size_t chunk_size = 64, float scale = 0.0f);
     size_t stft(size_t input, size_t weight, size_t stride, size_t num_fft_bins);
 
+    size_t altup_predict(size_t coefs, const size_t* streams, size_t num_streams);
+    size_t altup_correct(size_t coefs, size_t innovation, const size_t* predictions, size_t num_predictions);
+
+    size_t gaussian_topk(size_t input, float ppf);
+
+    size_t maxpool1d(size_t input, size_t kernel_size, size_t stride);
+    size_t leaky_relu(size_t input, float negative_slope = 0.01f);
+    size_t bilstm_sequence(size_t input, size_t w_ih_fwd, size_t w_hh_fwd, size_t b_ih_fwd, size_t b_hh_fwd,
+                           size_t w_ih_bwd, size_t w_hh_bwd, size_t b_ih_bwd, size_t b_hh_bwd);
+    size_t conv2d_k3s1p1(size_t input, size_t weight);
+    size_t conv2d_k3s1p1(size_t input, size_t weight, size_t bias);
+    size_t stats_pool(size_t input);
+
     size_t sample(size_t logits, float temperature = 0.6f, float top_p = 0.95f, size_t top_k = 20,
                   const std::unordered_map<uint32_t, float>& logit_bias = {});
     
     size_t concat(size_t input1, size_t input2, int axis = 0);
+    size_t cat(const std::vector<size_t>& inputs, int axis);
     size_t scatter_topk(size_t indices, size_t values, size_t num_classes);
     
     void set_input(size_t node_id, const void* data, Precision precision);
@@ -653,4 +725,4 @@ namespace GraphFile {
     };
 }
 
-#endif
\ No newline at end of file
+#endif
diff --git a/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/kernel.h b/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/kernel.h
index 0ec7265..77f950a 100644
--- a/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/kernel.h
+++ b/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/kernel.h
@@ -11,7 +11,9 @@ enum class ScalarOpType {
     SUBTRACT,
     MULTIPLY,
     DIVIDE,
+    ABS,
     EXP,
+    POW,
     SQRT,
     COS,
     SIN,
@@ -54,6 +56,14 @@ void cactus_matmul_int8(const int8_t* A, const float* A_scales,
                         const int8_t* B, const __fp16* B_scales,
                         __fp16* C, size_t M, size_t K, size_t N, size_t group_size);
 
+void cactus_gemv_int8_i8mm(const int8_t* A, float A_scale,
+                            const int8_t* B, const __fp16* B_scales,
+                            __fp16* C, size_t K, size_t N, size_t group_size);
+
+void cactus_gemm_int8_i8mm(const int8_t* A, const float* A_scales,
+                            const int8_t* B, const __fp16* B_scales,
+                            __fp16* C, size_t M, size_t K, size_t N, size_t group_size);
+
 void cactus_gemv_int4(const int8_t* A, float A_scale,
                       const int8_t* B_packed, const __fp16* B_scales,
                       __fp16* C, size_t K, size_t N, size_t group_size);
@@ -97,6 +107,9 @@ void cactus_max_axis_f16(const __fp16* input, __fp16* output, size_t outer_size,
 void cactus_rms_norm_f16(const __fp16* input, const __fp16* weight, __fp16* output,
                           size_t batch_size, size_t dims, float eps);
 
+void cactus_layer_norm_f16(const __fp16* input, const __fp16* weight, const __fp16* bias,
+                            __fp16* output, size_t batch_size, size_t dims, float eps);
+
 void cactus_rope_f16(const __fp16* input, __fp16* output, size_t batch_size, size_t seq_len,
                       size_t num_heads, size_t head_dim, size_t start_pos, float theta);
 
@@ -108,6 +121,8 @@ void cactus_softmax_f16(const __fp16* input, __fp16* output, size_t batch_size,
 
 void cactus_relu_f16(const __fp16* input, __fp16* output, size_t num_elements);
 
+void cactus_leaky_relu_f16(const __fp16* input, __fp16* output, size_t num_elements, float negative_slope);
+
 void cactus_silu_f16(const __fp16* input, __fp16* output, size_t num_elements);
 
 void cactus_gelu_f16(const __fp16* input, __fp16* output, size_t num_elements);
@@ -163,21 +178,54 @@ void cactus_batchnorm_f32(
 void cactus_attention_f16(const __fp16* queries, const __fp16* keys, const __fp16* values, __fp16* output,
                           size_t batch_size, size_t seq_len, size_t kv_seq_len, size_t num_q_heads, size_t num_kv_heads,
                           size_t head_dim, float scale, const __fp16* mask, size_t position_offset = 0, size_t window_size = 0,
-                          bool is_causal = true, bool mask_is_additive = false, bool mask_per_head = false);
+                          bool is_causal = true, bool mask_is_additive = false, bool mask_per_head = false,
+                          size_t v_head_dim = 0, float logit_cap = 0.0f);
 
 void cactus_attention_hybrid_int8_fp16(
-    const __fp16* queries,        
-    const int8_t* keys_cached, 
-    const int8_t* values_cached, 
+    const __fp16* queries,
+    const int8_t* keys_cached,
+    const int8_t* values_cached,
     const float* k_scales,
-    const float* v_scales, 
-    const __fp16* keys_new, 
-    const __fp16* values_new, 
+    const float* v_scales,
+    const __fp16* keys_new,
+    const __fp16* values_new,
     __fp16* output,
     size_t batch_size, size_t seq_len, size_t cache_len, size_t new_len,
     size_t num_q_heads, size_t num_kv_heads, size_t head_dim,
     float scale, size_t position_offset = 0, bool is_causal = true, size_t window_size = 0,
-    size_t group_size = KV_QUANT_GROUP_SIZE);
+    size_t group_size = KV_QUANT_GROUP_SIZE, size_t v_head_dim = 0);
+
+void cactus_gated_deltanet_decode_f16(
+    const __fp16* q_data,
+    const __fp16* k_data,
+    const __fp16* v_data,
+    const __fp16* g_data,
+    const __fp16* b_data,
+    const __fp16* s_data,
+    __fp16* out,
+    size_t B,
+    size_t Hq,
+    size_t Hv,
+    size_t K,
+    size_t V,
+    float scale);
+
+void cactus_gated_deltanet_prefill_f16(
+    const __fp16* q_data,
+    const __fp16* k_data,
+    const __fp16* v_data,
+    const __fp16* g_data,
+    const __fp16* b_data,
+    const __fp16* s_data,
+    __fp16* out,
+    size_t B,
+    size_t T,
+    size_t Hq,
+    size_t Hv,
+    size_t K,
+    size_t V,
+    size_t requested_chunk_size,
+    float scale);
 
 void cactus_conv1d_causal_depthwise_f16(
     const __fp16* input,
@@ -244,6 +292,18 @@ void cactus_conv1d_same_depthwise_f16_k9(
     size_t C
 );
 
+void cactus_conv2d_f16_k3s1p1_nchw(
+    const __fp16* input,
+    const __fp16* weight,
+    const __fp16* bias,
+    __fp16* output,
+    size_t N,
+    size_t C_in,
+    size_t H,
+    size_t W,
+    size_t C_out
+);
+
 void cactus_conv2d_f16_k3s2p1_nchw(
     const __fp16* input,
     const __fp16* weight,
@@ -305,6 +365,8 @@ void cactus_sample_f16(const __fp16* logits, uint32_t* output, size_t vocab_size
 void cactus_concat_f16(const __fp16* input1, const __fp16* input2, __fp16* output,
                        const size_t* shape1, const size_t* shape2, const size_t* output_shape,
                        size_t ndims, int axis);
+void cactus_cat_f16(const __fp16** inputs, __fp16* output, const size_t** input_shapes,
+                      const size_t* output_shape, size_t num_inputs, size_t rank, int axis);
 
 void cactus_int8_to_fp32(const int8_t* src, float* dst, size_t count, float scale = 1.0f);
 void cactus_fp32_to_int8(const float* src, int8_t* dst, size_t count, float scale = 1.0f);
@@ -328,6 +390,30 @@ inline size_t kv_scales_count(size_t seq_len, size_t kv_heads, size_t head_dim,
 
 void cactus_unpack_int4_to_int8(const uint8_t* packed, int8_t* unpacked, size_t unpacked_count);
 
+void cactus_gaussian_topk_f16(
+    const __fp16* input,
+    __fp16* output,
+    size_t rows,
+    size_t cols,
+    float ppf);
+
+void cactus_altup_predict_f16(
+    const __fp16* coefs,
+    const __fp16* const* streams,
+    __fp16* output,
+    size_t n,
+    size_t seq_len,
+    size_t hidden_dim);
+
+void cactus_altup_correct_f16(
+    const __fp16* coefs,
+    const __fp16* innovation,
+    const __fp16* const* predictions,
+    __fp16* output,
+    size_t n,
+    size_t seq_len,
+    size_t hidden_dim);
+
 void cactus_lstm_cell_f16(
     const __fp16* x_input,
     const __fp16* h_prev,
@@ -343,4 +429,31 @@ void cactus_lstm_cell_f16(
     size_t hidden_size
 );
 
-#endif
\ No newline at end of file
+void cactus_bilstm_sequence_f16(
+    const __fp16* input,
+    const __fp16* weight_ih_fwd,
+    const __fp16* weight_hh_fwd,
+    const __fp16* bias_ih_fwd,
+    const __fp16* bias_hh_fwd,
+    const __fp16* weight_ih_bwd,
+    const __fp16* weight_hh_bwd,
+    const __fp16* bias_ih_bwd,
+    const __fp16* bias_hh_bwd,
+    __fp16* output,
+    size_t batch_size,
+    size_t seq_len,
+    size_t input_size,
+    size_t hidden_size
+);
+
+void cactus_maxpool1d_f16(
+    const __fp16* input,
+    __fp16* output,
+    size_t batch_size,
+    size_t channels,
+    size_t input_length,
+    size_t kernel_size,
+    size_t stride
+);
+
+#endif
diff --git a/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/kernel_utils.h b/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/kernel_utils.h
index 118c85c..2d7a7a1 100644
--- a/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/kernel_utils.h
+++ b/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/kernel_utils.h
@@ -9,6 +9,8 @@
 #if defined(__ANDROID__)
 #include <sys/auxv.h>
 #include <asm/hwcap.h>
+#include <sched.h>
+#include <fstream>
 #endif
 #include <algorithm>
 #include <cmath>
@@ -44,6 +46,29 @@ inline void stream_store_f16x8(__fp16* dst, float16x8_t val) {
 #endif
 }
 
+inline bool cpu_has_i8mm() {  // Returns the cached result of a one-time i8mm capability probe.
+#if defined(__aarch64__)
+    static std::once_flag once;  // guards the probe so it runs a single time
+    static bool has = false;  // remains false on aarch64 platforms that are neither Apple nor Android
+
+    std::call_once(once, []() {
+#if defined(__APPLE__)
+    has = true;  // NOTE(review): hard-coded, no runtime query — confirm every supported Apple arm64 chip implements i8mm
+#elif defined(__ANDROID__)
+    unsigned long hwcap2 = getauxval(AT_HWCAP2);  // second hardware-capability word from the aux vector
+    #ifndef HWCAP2_I8MM  // fallback definition for system headers that lack the bit
+    #define HWCAP2_I8MM (1 << 13)
+    #endif
+    has = (hwcap2 & HWCAP2_I8MM) != 0;
+#endif
+    });
+
+    return has;
+#else
+    return false;  // non-aarch64 builds never report i8mm
+#endif
+}
+
 inline bool cpu_has_sme2() {
 #if defined(__aarch64__)
 	static std::once_flag once;
@@ -130,6 +155,33 @@ inline float32x4_t fast_tanh_f32x4(float32x4_t x) {
     return result;
 }
 
+constexpr size_t SIMD_F16_WIDTH = 8;  // number of __fp16 lanes in one float16x8_t
+
+inline size_t simd_align(size_t count, size_t width = SIMD_F16_WIDTH) {  // Rounds count down to a multiple of width.
+    return (count / width) * width;  // integer division drops the remainder; width must be non-zero
+}
+
+inline void f16x8_split_f32(float16x8_t v, float32x4_t& lo, float32x4_t& hi) {  // Widens the eight half-precision lanes of v into two float32x4_t outputs.
+    lo = vcvt_f32_f16(vget_low_f16(v));  // lanes 0-3
+    hi = vcvt_f32_f16(vget_high_f16(v));  // lanes 4-7
+}
+
+inline float16x8_t f32_merge_f16(float32x4_t lo, float32x4_t hi) {  // Narrows lo and hi to half precision and packs them into one vector.
+    return vcombine_f16(vcvt_f16_f32(lo), vcvt_f16_f32(hi));  // lo fills lanes 0-3, hi fills lanes 4-7
+}
+
+inline float32x4_t fast_sigmoid_f32x4(float32x4_t x) {  // Per-lane 1 / (1 + fast_exp_f32x4(-x)).
+    const float32x4_t one = vdupq_n_f32(1.0f);  // 1.0f broadcast to all four lanes
+    return vdivq_f32(one, vaddq_f32(one, fast_exp_f32x4(vnegq_f32(x))));  // fast_exp_f32x4 is defined elsewhere in this header; presumably an exp approximation — confirm its accuracy
+}
+
+template<typename F32x4Op>  // F32x4Op: callable taking and returning float32x4_t
+inline float16x8_t apply_f32_op_on_f16x8(float16x8_t v, F32x4Op op) {  // Applies op to a half-precision vector by widening to float32, calling op, and narrowing back.
+    float32x4_t lo, hi;
+    f16x8_split_f32(v, lo, hi);
+    return f32_merge_f16(op(lo), op(hi));  // op runs twice, once per four-lane half
+}
+
 inline void unpack_int4_as_int8x16x2(const uint8_t* ptr, int8x16_t& high_decoded, int8x16_t& low_decoded) {
     int8x16_t packed = vreinterpretq_s8_u8(vld1q_u8(ptr));
     high_decoded = vshrq_n_s8(packed, 4);
@@ -138,6 +190,80 @@ inline void unpack_int4_as_int8x16x2(const uint8_t* ptr, int8x16_t& high_decoded
 
 namespace CactusThreading {
 
+#if defined(__ANDROID__)
+    struct CoreTopology {  // Records which CPU ids were found in sysfs and which of them rank as performance cores.
+        std::vector<int> performance_cores;  // ids whose value is >= 70% of the largest value found
+        std::vector<int> all_cores;  // every id for which a positive capacity or max-frequency value was read
+
+        static CoreTopology& get() {  // shared instance; detect() runs on the first call only
+            static CoreTopology topo = detect();
+            return topo;
+        }
+
+    private:
+        static int read_sysfs_int(const char* path) {  // Reads one int from path; -1 if the file cannot be opened.
+            std::ifstream f(path);
+            if (!f.is_open()) return -1;
+            int val = -1;
+            f >> val;  // extraction result is not checked; callers treat any value <= 0 as missing
+            return val;
+        }
+
+        static CoreTopology detect() {  // Probes sysfs per CPU and classifies cores by relative capacity.
+            CoreTopology topo;
+            constexpr int MAX_CPUS = 16;  // only cpu0..cpu15 are probed
+            std::vector<std::pair<int, int>> core_caps;  // (cpu id, capacity or max frequency)
+
+            for (int i = 0; i < MAX_CPUS; ++i) {
+                char path[128];
+
+                snprintf(path, sizeof(path),
+                         "/sys/devices/system/cpu/cpu%d/cpu_capacity", i);
+                int cap = read_sysfs_int(path);
+                if (cap > 0) {  // preferred source: cpu_capacity
+                    core_caps.push_back({i, cap});
+                    topo.all_cores.push_back(i);
+                    continue;  // skip the frequency fallback for this cpu
+                }
+
+                snprintf(path, sizeof(path),
+                         "/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq", i);
+                int freq = read_sysfs_int(path);
+                if (freq > 0) {  // fallback source; NOTE(review): stored in the same list as capacities, so the 70% comparison mixes units if only some cores expose cpu_capacity
+                    core_caps.push_back({i, freq});
+                    topo.all_cores.push_back(i);
+                }
+            }
+
+            if (core_caps.empty()) return topo;  // nothing readable: both lists stay empty
+
+            int max_cap = 0;
+            for (auto& [id, cap] : core_caps) {
+                max_cap = std::max(max_cap, cap);
+            }
+
+            int threshold = static_cast<int>(max_cap * 0.70);  // cut-off at 70% of the best core
+            for (auto& [id, cap] : core_caps) {
+                if (cap >= threshold) {
+                    topo.performance_cores.push_back(id);
+                }
+            }
+
+            return topo;
+        }
+    };
+
+    inline bool pin_current_thread_to_cores(const std::vector<int>& cores) {  // Limits the calling thread's CPU affinity to the listed core ids; true on success.
+        if (cores.empty()) return false;  // an empty list would leave no CPU to run on, so nothing is changed
+        cpu_set_t mask;
+        CPU_ZERO(&mask);
+        for (int core : cores) {
+            CPU_SET(core, &mask);  // ids are not range-checked against CPU_SETSIZE here
+        }
+        return sched_setaffinity(0, sizeof(mask), &mask) == 0;  // pid 0 targets the calling thread
+    }
+#endif
+
     class ThreadPool {
     private:
         static constexpr size_t MAX_WORKERS = 16;
@@ -184,9 +310,25 @@ namespace CactusThreading {
             : stop(false), pending_tasks(0) {
             num_workers_ = std::min(num_threads, MAX_WORKERS);
             if (num_workers_ == 0) num_workers_ = 1;
+
+#if defined(__ANDROID__)
+            auto& topo = CoreTopology::get();
+            if (!topo.performance_cores.empty()) {
+                num_workers_ = std::min(num_workers_, topo.performance_cores.size());
+            }
+#endif
+
             workers.reserve(num_workers_);
             for (size_t i = 0; i < num_workers_; ++i) {
-                workers.emplace_back(&ThreadPool::worker_thread, this);
+                workers.emplace_back([this]() {
+#if defined(__ANDROID__)
+                    auto& perf = CoreTopology::get().performance_cores;
+                    if (!perf.empty()) {
+                        pin_current_thread_to_cores(perf);
+                    }
+#endif
+                    worker_thread();
+                });
             }
         }
 
@@ -498,5 +640,52 @@ namespace CactusThreading {
 
 }
 
+template<typename SimdOp, typename ScalarOp>  // SimdOp: float16x8_t -> float16x8_t; ScalarOp: one __fp16 element -> one output element
+void elementwise_op_f16(const __fp16* input, __fp16* output, size_t num_elements,  // Maps input to output element-wise: simd_op on 8-lane chunks, scalar_op on each range's leftover tail.
+                        bool use_streaming, CactusThreading::ParallelConfig config,
+                        SimdOp simd_op, ScalarOp scalar_op, size_t unroll = 4) {  // unroll only has an effect when use_streaming is true
+    CactusThreading::parallel_for(num_elements, config,
+        [&](size_t start, size_t end) {  // captures by reference; presumably parallel_for returns only after all ranges finish — confirm
+            const size_t n = end - start;
+            const size_t vec_end = start + simd_align(n);  // end of the whole-vector part, measured from start rather than from index 0
+
+            if (use_streaming && unroll >= 4) {  // streaming stores, four vectors (32 elements) per iteration
+                const size_t unrolled_end = start + simd_align(n, SIMD_F16_WIDTH * 4);
+                for (size_t i = start; i < unrolled_end; i += SIMD_F16_WIDTH * 4) {
+                    __builtin_prefetch(&input[i + 256], 0, 0);  // read prefetch 256 elements ahead; the address may lie past the end of input
+                    float16x8_t v0 = simd_op(vld1q_f16(&input[i]));
+                    float16x8_t v1 = simd_op(vld1q_f16(&input[i + 8]));
+                    float16x8_t v2 = simd_op(vld1q_f16(&input[i + 16]));
+                    float16x8_t v3 = simd_op(vld1q_f16(&input[i + 24]));
+                    stream_store_f16x8(&output[i], v0);
+                    stream_store_f16x8(&output[i + 8], v1);
+                    stream_store_f16x8(&output[i + 16], v2);
+                    stream_store_f16x8(&output[i + 24], v3);
+                }
+                for (size_t i = unrolled_end; i < vec_end; i += SIMD_F16_WIDTH) {  // whole vectors left over after the 4x loop
+                    stream_store_f16x8(&output[i], simd_op(vld1q_f16(&input[i])));
+                }
+            } else if (use_streaming && unroll >= 2) {  // streaming stores, two vectors (16 elements) per iteration
+                const size_t unrolled_end = start + simd_align(n, SIMD_F16_WIDTH * 2);
+                for (size_t i = start; i < unrolled_end; i += SIMD_F16_WIDTH * 2) {
+                    __builtin_prefetch(&input[i + 128], 0, 0);  // read prefetch 128 elements ahead
+                    float16x8_t v0 = simd_op(vld1q_f16(&input[i]));
+                    float16x8_t v1 = simd_op(vld1q_f16(&input[i + 8]));
+                    stream_store_f16x8(&output[i], v0);
+                    stream_store_f16x8(&output[i + 8], v1);
+                }
+                for (size_t i = unrolled_end; i < vec_end; i += SIMD_F16_WIDTH) {  // whole vectors left over after the 2x loop
+                    stream_store_f16x8(&output[i], simd_op(vld1q_f16(&input[i])));
+                }
+            } else {  // reached when use_streaming is false or unroll < 2: one vector per iteration with regular stores
+                for (size_t i = start; i < vec_end; i += SIMD_F16_WIDTH) {
+                    vst1q_f16(&output[i], simd_op(vld1q_f16(&input[i])));
+                }
+            }
+            for (size_t i = vec_end; i < end; ++i) {  // fewer than 8 elements remain; handled one at a time
+                output[i] = scalar_op(input[i]);
+            }
+        });
+}
 
-#endif // KERNEL_UTILS_H 
+#endif // KERNEL_UTILS_H
diff --git a/ios/cactus.xcframework/ios-arm64/cactus.framework/cactus b/ios/cactus.xcframework/ios-arm64/cactus.framework/cactus
index e521921..6dcd3a6 100755
Binary files a/ios/cactus.xcframework/ios-arm64/cactus.framework/cactus and b/ios/cactus.xcframework/ios-arm64/cactus.framework/cactus differ
diff --git a/nitrogen/generated/shared/c++/HybridCactusSpec.cpp b/nitrogen/generated/shared/c++/HybridCactusSpec.cpp
index e3b9763..40b89eb 100644
--- a/nitrogen/generated/shared/c++/HybridCactusSpec.cpp
+++ b/nitrogen/generated/shared/c++/HybridCactusSpec.cpp
@@ -16,6 +16,7 @@ namespace margelo::nitro::cactus {
     registerHybrids(this, [](Prototype& prototype) {
       prototype.registerHybridMethod("init", &HybridCactusSpec::init);
       prototype.registerHybridMethod("complete", &HybridCactusSpec::complete);
+      prototype.registerHybridMethod("prefill", &HybridCactusSpec::prefill);
       prototype.registerHybridMethod("tokenize", &HybridCactusSpec::tokenize);
       prototype.registerHybridMethod("scoreWindow", &HybridCactusSpec::scoreWindow);
       prototype.registerHybridMethod("transcribe", &HybridCactusSpec::transcribe);
@@ -27,6 +28,8 @@ namespace margelo::nitro::cactus {
       prototype.registerHybridMethod("embed", &HybridCactusSpec::embed);
       prototype.registerHybridMethod("imageEmbed", &HybridCactusSpec::imageEmbed);
       prototype.registerHybridMethod("audioEmbed", &HybridCactusSpec::audioEmbed);
+      prototype.registerHybridMethod("diarize", &HybridCactusSpec::diarize);
+      prototype.registerHybridMethod("embedSpeaker", &HybridCactusSpec::embedSpeaker);
       prototype.registerHybridMethod("reset", &HybridCactusSpec::reset);
       prototype.registerHybridMethod("stop", &HybridCactusSpec::stop);
       prototype.registerHybridMethod("destroy", &HybridCactusSpec::destroy);
diff --git a/nitrogen/generated/shared/c++/HybridCactusSpec.hpp b/nitrogen/generated/shared/c++/HybridCactusSpec.hpp
index f44ffbe..63dceaa 100644
--- a/nitrogen/generated/shared/c++/HybridCactusSpec.hpp
+++ b/nitrogen/generated/shared/c++/HybridCactusSpec.hpp
@@ -55,6 +55,7 @@ namespace margelo::nitro::cactus {
       // Methods
       virtual std::shared_ptr<Promise<void>> init(const std::string& modelPath, const std::optional<std::string>& corpusDir, std::optional<bool> cacheIndex) = 0;
       virtual std::shared_ptr<Promise<std::string>> complete(const std::string& messagesJson, double responseBufferSize, const std::optional<std::string>& optionsJson, const std::optional<std::string>& toolsJson, const std::optional<std::function<void(const std::string& /* token */, double /* tokenId */)>>& callback) = 0;
+      virtual std::shared_ptr<Promise<std::string>> prefill(const std::string& messagesJson, double responseBufferSize, const std::optional<std::string>& optionsJson, const std::optional<std::string>& toolsJson) = 0;
       virtual std::shared_ptr<Promise<std::vector<double>>> tokenize(const std::string& text) = 0;
       virtual std::shared_ptr<Promise<std::string>> scoreWindow(const std::vector<double>& tokens, double start, double end, double context) = 0;
       virtual std::shared_ptr<Promise<std::string>> transcribe(const std::variant<std::vector<double>, std::string>& audio, const std::string& prompt, double responseBufferSize, const std::optional<std::string>& optionsJson, const std::optional<std::function<void(const std::string& /* token */, double /* tokenId */)>>& callback) = 0;
@@ -66,6 +67,8 @@ namespace margelo::nitro::cactus {
       virtual std::shared_ptr<Promise<std::vector<double>>> embed(const std::string& text, double embeddingBufferSize, bool normalize) = 0;
       virtual std::shared_ptr<Promise<std::vector<double>>> imageEmbed(const std::string& imagePath, double embeddingBufferSize) = 0;
       virtual std::shared_ptr<Promise<std::vector<double>>> audioEmbed(const std::string& audioPath, double embeddingBufferSize) = 0;
+      virtual std::shared_ptr<Promise<std::string>> diarize(const std::variant<std::vector<double>, std::string>& audio, double responseBufferSize, const std::optional<std::string>& optionsJson) = 0;
+      virtual std::shared_ptr<Promise<std::string>> embedSpeaker(const std::variant<std::vector<double>, std::string>& audio, double responseBufferSize, const std::optional<std::string>& optionsJson) = 0;
       virtual std::shared_ptr<Promise<void>> reset() = 0;
       virtual std::shared_ptr<Promise<void>> stop() = 0;
       virtual std::shared_ptr<Promise<void>> destroy() = 0;
diff --git a/package.json b/package.json
index 2383490..76e9dc5 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "cactus-react-native",
-  "version": "1.10.4",
+  "version": "1.12.0",
   "description": "Run AI models locally on mobile devices",
   "main": "./lib/module/index.js",
   "types": "./lib/typescript/src/index.d.ts",
diff --git a/src/classes/CactusVAD.ts b/src/classes/CactusAudio.ts
similarity index 72%
rename from src/classes/CactusVAD.ts
rename to src/classes/CactusAudio.ts
index 275a7e2..8add5b7 100644
--- a/src/classes/CactusVAD.ts
+++ b/src/classes/CactusAudio.ts
@@ -1,14 +1,18 @@
 import { Cactus, CactusFileSystem } from '../native';
 import type {
-  CactusVADParams,
-  CactusVADDownloadParams,
-  CactusVADVadParams,
-  CactusVADResult,
-} from '../types/CactusVAD';
+  CactusAudioParams,
+  CactusAudioDownloadParams,
+  CactusAudioVADParams,
+  CactusAudioVADResult,
+  CactusAudioDiarizeParams,
+  CactusAudioDiarizeResult,
+  CactusAudioEmbedSpeakerParams,
+  CactusAudioEmbedSpeakerResult,
+} from '../types/CactusAudio';
 import { getRegistry } from '../modelRegistry';
 import type { CactusModel } from '../types/common';
 
-export class CactusVAD {
+export class CactusAudio {
   private readonly cactus = new Cactus();
 
   private readonly model: string;
@@ -26,25 +30,25 @@ export class CactusVAD {
     pro: false,
   };
 
-  constructor({ model, options }: CactusVADParams = {}) {
-    this.model = model ?? CactusVAD.defaultModel;
+  constructor({ model, options }: CactusAudioParams = {}) {
+    this.model = model ?? CactusAudio.defaultModel;
     this.options = {
       quantization:
-        options?.quantization ?? CactusVAD.defaultOptions.quantization,
-      pro: options?.pro ?? CactusVAD.defaultOptions.pro,
+        options?.quantization ?? CactusAudio.defaultOptions.quantization,
+      pro: options?.pro ?? CactusAudio.defaultOptions.pro,
     };
   }
 
   public async download({
     onProgress,
-  }: CactusVADDownloadParams = {}): Promise<void> {
+  }: CactusAudioDownloadParams = {}): Promise<void> {
     if (this.isModelPath(this.model)) {
       onProgress?.(1.0);
       return;
     }
 
     if (this.isDownloading) {
-      throw new Error('CactusVAD is already downloading');
+      throw new Error('CactusAudio is already downloading');
     }
 
     if (await CactusFileSystem.modelExists(this.getModelName())) {
@@ -101,11 +105,26 @@ export class CactusVAD {
   public async vad({
     audio,
     options,
-  }: CactusVADVadParams): Promise<CactusVADResult> {
+  }: CactusAudioVADParams): Promise<CactusAudioVADResult> {
     await this.init();
     return this.cactus.vad(audio, options);
   }
 
+  public async diarize({ // Runs speaker diarization on `audio` through the Cactus wrapper.
+    audio,
+    options,
+  }: CactusAudioDiarizeParams): Promise<CactusAudioDiarizeResult> {
+    await this.init(); // awaited on every call; NOTE(review): presumably a no-op once initialized — confirm in init()
+    return this.cactus.diarize(audio, options); // resolves with the wrapper's parsed result object
+  }
+
+  public async embedSpeaker({ // Requests a speaker embedding for `audio` through the Cactus wrapper.
+    audio,
+  }: CactusAudioEmbedSpeakerParams): Promise<CactusAudioEmbedSpeakerResult> {
+    await this.init(); // awaited on every call; NOTE(review): presumably a no-op once initialized — confirm in init()
+    return this.cactus.embedSpeaker(audio); // only `audio` is forwarded; this method takes no options
+  }
+
   public async destroy(): Promise<void> {
     if (!this.isInitialized) {
       return;
diff --git a/src/classes/CactusLM.ts b/src/classes/CactusLM.ts
index 69c1ebf..252af00 100644
--- a/src/classes/CactusLM.ts
+++ b/src/classes/CactusLM.ts
@@ -3,6 +3,8 @@ import type {
   CactusLMDownloadParams,
   CactusLMCompleteParams,
   CactusLMCompleteResult,
+  CactusLMPrefillParams,
+  CactusLMPrefillResult,
   CactusLMTokenizeParams,
   CactusLMTokenizeResult,
   CactusLMScoreWindowParams,
@@ -159,6 +161,40 @@ export class CactusLM {
     }
   }
 
+  public async prefill({ // Forwards messages/options/tools to the wrapper's prefill and returns its parsed result.
+    messages,
+    options,
+    tools,
+  }: CactusLMPrefillParams): Promise<CactusLMPrefillResult> {
+    if (this.isGenerating) { // reject instead of queueing while the flag is set
+      throw new Error('CactusLM is already generating');
+    }
+
+    options = { ...CactusLM.defaultCompleteOptions, ...options }; // caller-supplied fields override the class defaults
+    const toolsInternal = tools?.map((tool) => ({ // wrap each tool as { type: 'function', function: tool }
+      type: 'function' as const,
+      function: tool,
+    }));
+
+    const responseBufferSize = // 8 * maxTokens + 256; NOTE(review): rationale for these constants is not shown here
+      8 * (options.maxTokens ?? CactusLM.defaultCompleteOptions.maxTokens) +
+      256;
+
+    await this.init(); // NOTE(review): isGenerating is set only after this await, so overlapping calls can both pass the guard above
+
+    this.isGenerating = true;
+    try {
+      return await this.cactus.prefill(
+        messages,
+        responseBufferSize,
+        options,
+        toolsInternal
+      );
+    } finally {
+      this.isGenerating = false; // cleared whether the wrapper call resolves or rejects
+    }
+  }
+
   public async tokenize({
     text,
   }: CactusLMTokenizeParams): Promise<CactusLMTokenizeResult> {
diff --git a/src/hooks/useCactusVAD.ts b/src/hooks/useCactusAudio.ts
similarity index 67%
rename from src/hooks/useCactusVAD.ts
rename to src/hooks/useCactusAudio.ts
index 4f6bdc5..30ee2c0 100644
--- a/src/hooks/useCactusVAD.ts
+++ b/src/hooks/useCactusAudio.ts
@@ -1,24 +1,28 @@
 import { useCallback, useEffect, useState, useRef } from 'react';
-import { CactusVAD } from '../classes/CactusVAD';
+import { CactusAudio } from '../classes/CactusAudio';
 import { CactusFileSystem } from '../native';
 import { getErrorMessage } from '../utils/error';
 import type {
-  CactusVADParams,
-  CactusVADDownloadParams,
-  CactusVADVadParams,
-  CactusVADResult,
-} from '../types/CactusVAD';
+  CactusAudioParams,
+  CactusAudioDownloadParams,
+  CactusAudioVADParams,
+  CactusAudioVADResult,
+  CactusAudioDiarizeParams,
+  CactusAudioDiarizeResult,
+  CactusAudioEmbedSpeakerParams,
+  CactusAudioEmbedSpeakerResult,
+} from '../types/CactusAudio';
 import type { CactusModel } from '../types/common';
 
-export const useCactusVAD = ({
+export const useCactusAudio = ({
   model = 'silero-vad',
   options: modelOptions = {
     quantization: undefined,
     pro: false,
   },
-}: CactusVADParams = {}) => {
-  const [cactusVAD, setCactusVAD] = useState(
-    () => new CactusVAD({ model, options: modelOptions })
+}: CactusAudioParams = {}) => {
+  const [cactusAudio, setCactusAudio] = useState(
+    () => new CactusAudio({ model, options: modelOptions })
   );
 
   const [isInitializing, setIsInitializing] = useState(false);
@@ -35,14 +39,14 @@ export const useCactusVAD = ({
   }, [model]);
 
   useEffect(() => {
-    const newInstance = new CactusVAD({
+    const newInstance = new CactusAudio({
       model,
       options: {
         quantization: modelOptions.quantization,
         pro: modelOptions.pro,
       },
     });
-    setCactusVAD(newInstance);
+    setCactusAudio(newInstance);
 
     setIsInitializing(false);
     setIsDownloaded(false);
@@ -73,14 +77,14 @@ export const useCactusVAD = ({
 
   useEffect(() => {
     return () => {
-      cactusVAD.destroy().catch(() => {});
+      cactusAudio.destroy().catch(() => {});
     };
-  }, [cactusVAD]);
+  }, [cactusAudio]);
 
   const download = useCallback(
-    async ({ onProgress }: CactusVADDownloadParams = {}) => {
+    async ({ onProgress }: CactusAudioDownloadParams = {}) => {
       if (isDownloading) {
-        const message = 'CactusVAD is already downloading';
+        const message = 'CactusAudio is already downloading';
         setError(message);
         throw new Error(message);
       }
@@ -97,7 +101,7 @@ export const useCactusVAD = ({
       setDownloadProgress(0);
       setIsDownloading(true);
       try {
-        await cactusVAD.download({
+        await cactusAudio.download({
           onProgress: (progress) => {
             if (
               currentModelRef.current !== thisModel ||
@@ -141,12 +145,12 @@ export const useCactusVAD = ({
         setDownloadProgress(0);
       }
     },
-    [cactusVAD, isDownloading, isDownloaded]
+    [cactusAudio, isDownloading, isDownloaded]
   );
 
   const init = useCallback(async () => {
     if (isInitializing) {
-      const message = 'CactusVAD is already initializing';
+      const message = 'CactusAudio is already initializing';
       setError(message);
       throw new Error(message);
     }
@@ -154,50 +158,81 @@ export const useCactusVAD = ({
     setError(null);
     setIsInitializing(true);
     try {
-      await cactusVAD.init();
+      await cactusAudio.init();
     } catch (e) {
       setError(getErrorMessage(e));
       throw e;
     } finally {
       setIsInitializing(false);
     }
-  }, [cactusVAD, isInitializing]);
+  }, [cactusAudio, isInitializing]);
 
   const vad = useCallback(
     async ({
       audio,
       options,
-    }: CactusVADVadParams): Promise<CactusVADResult> => {
+    }: CactusAudioVADParams): Promise<CactusAudioVADResult> => {
       setError(null);
       try {
-        return await cactusVAD.vad({ audio, options });
+        return await cactusAudio.vad({ audio, options });
       } catch (e) {
         setError(getErrorMessage(e));
         throw e;
       }
     },
-    [cactusVAD]
+    [cactusAudio]
+  );
+
+  const diarize = useCallback( // Hook wrapper around cactusAudio.diarize that records failures via setError.
+    async ({
+      audio,
+      options,
+    }: CactusAudioDiarizeParams): Promise<CactusAudioDiarizeResult> => {
+      setError(null); // clear any previously recorded error before starting
+      try {
+        return await cactusAudio.diarize({ audio, options });
+      } catch (e) {
+        setError(getErrorMessage(e)); // store the message in hook state...
+        throw e; // ...and still rethrow so the caller sees the failure
+      }
+    },
+    [cactusAudio] // callback identity changes only when the CactusAudio instance changes
+  );
+
+  const embedSpeaker = useCallback( // Hook wrapper around cactusAudio.embedSpeaker that records failures via setError.
+    async ({
+      audio,
+    }: CactusAudioEmbedSpeakerParams): Promise<CactusAudioEmbedSpeakerResult> => {
+      setError(null); // clear any previously recorded error before starting
+      try {
+        return await cactusAudio.embedSpeaker({ audio });
+      } catch (e) {
+        setError(getErrorMessage(e)); // store the message in hook state...
+        throw e; // ...and still rethrow so the caller sees the failure
+      }
+    },
+    [cactusAudio] // callback identity changes only when the CactusAudio instance changes
   );
 
   const destroy = useCallback(async () => {
     setError(null);
     try {
-      await cactusVAD.destroy();
+      await cactusAudio.destroy();
     } catch (e) {
       setError(getErrorMessage(e));
       throw e;
     }
-  }, [cactusVAD]);
+  }, [cactusAudio]);
 
   const getModels = useCallback(async (): Promise<CactusModel[]> => {
     setError(null);
     try {
-      return await cactusVAD.getModels();
+      return await cactusAudio.getModels();
     } catch (e) {
       setError(getErrorMessage(e));
       throw e;
     }
-  }, [cactusVAD]);
+  }, [cactusAudio]);
 
   return {
     isInitializing,
@@ -209,6 +244,8 @@ export const useCactusVAD = ({
     download,
     init,
     vad,
+    diarize,
+    embedSpeaker,
     destroy,
     getModels,
   };
diff --git a/src/index.tsx b/src/index.tsx
index 299705f..adb0a03 100644
--- a/src/index.tsx
+++ b/src/index.tsx
@@ -1,13 +1,13 @@
 // Classes
 export { CactusLM } from './classes/CactusLM';
 export { CactusSTT } from './classes/CactusSTT';
-export { CactusVAD } from './classes/CactusVAD';
+export { CactusAudio } from './classes/CactusAudio';
 export { CactusIndex } from './classes/CactusIndex';
 
 // Hooks
 export { useCactusLM } from './hooks/useCactusLM';
 export { useCactusSTT } from './hooks/useCactusSTT';
-export { useCactusVAD } from './hooks/useCactusVAD';
+export { useCactusAudio } from './hooks/useCactusAudio';
 export { useCactusIndex } from './hooks/useCactusIndex';
 
 // Registry
@@ -23,6 +23,8 @@ export type {
   CactusLMTool,
   CactusLMCompleteParams,
   CactusLMCompleteResult,
+  CactusLMPrefillParams,
+  CactusLMPrefillResult,
   CactusLMTokenizeParams,
   CactusLMTokenizeResult,
   CactusLMScoreWindowParams,
@@ -49,13 +51,18 @@ export type {
   CactusSTTDetectLanguageResult,
 } from './types/CactusSTT';
 export type {
-  CactusVADParams,
-  CactusVADDownloadParams,
-  CactusVADVadParams,
-  CactusVADOptions,
-  CactusVADSegment,
-  CactusVADResult,
-} from './types/CactusVAD';
+  CactusAudioParams,
+  CactusAudioDownloadParams,
+  CactusAudioVADOptions,
+  CactusAudioVADSegment,
+  CactusAudioVADParams,
+  CactusAudioVADResult,
+  CactusAudioDiarizeOptions,
+  CactusAudioDiarizeParams,
+  CactusAudioDiarizeResult,
+  CactusAudioEmbedSpeakerParams,
+  CactusAudioEmbedSpeakerResult,
+} from './types/CactusAudio';
 export type {
   CactusIndexParams,
   CactusIndexAddParams,
diff --git a/src/modelRegistry.ts b/src/modelRegistry.ts
index b72d307..f3fac09 100644
--- a/src/modelRegistry.ts
+++ b/src/modelRegistry.ts
@@ -1,6 +1,6 @@
 import type { CactusModel } from './types/common';
 
-const RUNTIME_VERSION = '1.10.4';
+const RUNTIME_VERSION = '1.12.0';
 
 let registryPromise: Promise<{ [key: string]: CactusModel }> | null = null;
 
diff --git a/src/native/Cactus.ts b/src/native/Cactus.ts
index 4608b12..1a667ee 100644
--- a/src/native/Cactus.ts
+++ b/src/native/Cactus.ts
@@ -3,6 +3,7 @@ import type { Cactus as CactusSpec } from '../specs/Cactus.nitro';
 import { CactusImage } from './CactusImage';
 import type {
   CactusLMCompleteResult,
+  CactusLMPrefillResult,
   CactusLMMessage,
   CactusLMCompleteOptions,
   CactusLMTool,
@@ -16,7 +17,13 @@ import type {
   CactusSTTDetectLanguageOptions,
   CactusSTTDetectLanguageResult,
 } from '../types/CactusSTT';
-import type { CactusVADOptions, CactusVADResult } from '../types/CactusVAD';
+import type {
+  CactusAudioVADOptions,
+  CactusAudioVADResult,
+  CactusAudioDiarizeOptions,
+  CactusAudioDiarizeResult,
+  CactusAudioEmbedSpeakerResult,
+} from '../types/CactusAudio';
 
 export class Cactus {
   private readonly hybridCactus =
@@ -72,6 +79,7 @@ export class Cactus {
           tool_rag_top_k: options.toolRagTopK,
           include_stop_sequences: options.includeStopSequences,
           use_vad: options.useVad,
+          enable_thinking_if_supported: options.enableThinking,
         })
       : undefined;
     const toolsJson = JSON.stringify(tools);
@@ -90,6 +98,7 @@ export class Cactus {
       return {
         success: parsed.success,
         response: parsed.response,
+        thinking: parsed.thinking,
         functionCalls: parsed.function_calls,
         cloudHandoff: parsed.cloud_handoff,
         confidence: parsed.confidence,
@@ -107,6 +116,54 @@ export class Cactus {
     }
   }
 
+  public async prefill( // Serializes the request to JSON, calls hybridCactus.prefill, and maps the JSON reply to camelCase.
+    messages: CactusLMMessage[],
+    responseBufferSize: number,
+    options?: CactusLMCompleteOptions,
+    tools?: { type: 'function'; function: CactusLMTool }[]
+  ): Promise<CactusLMPrefillResult> {
+    const messagesJson = JSON.stringify(messages);
+    const optionsJson = options // camelCase option fields are re-keyed to snake_case JSON
+      ? JSON.stringify({
+          temperature: options.temperature,
+          top_p: options.topP,
+          top_k: options.topK,
+          max_tokens: options.maxTokens,
+          stop_sequences: options.stopSequences,
+          force_tools: options.forceTools,
+          telemetry_enabled: options.telemetryEnabled,
+          confidence_threshold: options.confidenceThreshold,
+          tool_rag_top_k: options.toolRagTopK,
+          include_stop_sequences: options.includeStopSequences,
+          use_vad: options.useVad,
+          enable_thinking_if_supported: options.enableThinking,
+        })
+      : undefined; // no options object => optionsJson is left undefined
+    const toolsJson = JSON.stringify(tools); // JSON.stringify(undefined) yields undefined at runtime, so absent tools stay absent
+
+    const response = await this.hybridCactus.prefill(
+      messagesJson,
+      responseBufferSize,
+      optionsJson,
+      toolsJson
+    );
+
+    try {
+      const parsed = JSON.parse(response);
+
+      return { // fields are copied without validation; a reply with success === false is returned, not thrown
+        success: parsed.success,
+        error: parsed.error,
+        prefillTokens: parsed.prefill_tokens,
+        prefillTps: parsed.prefill_tps,
+        totalTimeMs: parsed.total_time_ms,
+        ramUsageMb: parsed.ram_usage_mb,
+      };
+    } catch { // any error raised while parsing or mapping is replaced by this generic message
+      throw new Error('Unable to parse prefill response');
+    }
+  }
+
   public tokenize(text: string): Promise<number[]> {
     return this.hybridCactus.tokenize(text);
   }
@@ -195,6 +252,7 @@ export class Cactus {
           confirmation_threshold: options.confirmationThreshold,
           min_chunk_size: options.minChunkSize,
           telemetry_enabled: options.telemetryEnabled,
+          language: options.language,
         })
       : undefined;
     return this.hybridCactus.streamTranscribeStart(optionsJson);
@@ -272,8 +330,8 @@ export class Cactus {
 
   public async vad(
     audio: string | number[],
-    options?: CactusVADOptions
-  ): Promise<CactusVADResult> {
+    options?: CactusAudioVADOptions
+  ): Promise<CactusAudioVADResult> {
     if (typeof audio === 'string') {
       audio = audio.replace('file://', '');
     }
@@ -338,6 +396,63 @@ export class Cactus {
     );
   }
 
+  public async diarize( // Calls hybridCactus.diarize and maps its JSON reply to a CactusAudioDiarizeResult.
+    audio: string | number[],
+    options?: CactusAudioDiarizeOptions
+  ): Promise<CactusAudioDiarizeResult> {
+    if (typeof audio === 'string') {
+      audio = audio.replace('file://', ''); // removes the first 'file://' occurrence (not anchored to the start of the string)
+    }
+    const optionsJson = options // camelCase option fields are re-keyed to snake_case JSON
+      ? JSON.stringify({
+          step_ms: options.stepMs,
+          threshold: options.threshold,
+          num_speakers: options.numSpeakers,
+          min_speakers: options.minSpeakers,
+          max_speakers: options.maxSpeakers,
+        })
+      : undefined; // no options object => optionsJson is left undefined
+    const response = await this.hybridCactus.diarize(
+      audio,
+      2 * 1024 * 1024, // responseBufferSize argument: 2 MiB
+      optionsJson
+    );
+    try {
+      const parsed = JSON.parse(response);
+      return { // fields are copied without validation; a reply with success === false is returned, not thrown
+        success: parsed.success,
+        error: parsed.error,
+        numSpeakers: parsed.num_speakers,
+        scores: parsed.scores,
+        totalTimeMs: parsed.total_time_ms,
+        ramUsageMb: parsed.ram_usage_mb,
+      };
+    } catch { // any error raised while parsing or mapping is replaced by this generic message
+      throw new Error('Unable to parse diarize response');
+    }
+  }
+
+  public async embedSpeaker( // Calls hybridCactus.embedSpeaker and maps its JSON reply to a CactusAudioEmbedSpeakerResult.
+    audio: string | number[]
+  ): Promise<CactusAudioEmbedSpeakerResult> {
+    if (typeof audio === 'string') {
+      audio = audio.replace('file://', ''); // removes the first 'file://' occurrence (not anchored to the start of the string)
+    }
+    const response = await this.hybridCactus.embedSpeaker(audio, 65536); // responseBufferSize = 65536 (64 KiB); the spec's optional optionsJson is not passed
+    try {
+      const parsed = JSON.parse(response);
+      return { // fields are copied without validation; a reply with success === false is returned, not thrown
+        success: parsed.success,
+        error: parsed.error,
+        embedding: parsed.embedding,
+        totalTimeMs: parsed.total_time_ms,
+        ramUsageMb: parsed.ram_usage_mb,
+      };
+    } catch { // any error raised while parsing or mapping is replaced by this generic message
+      throw new Error('Unable to parse embed speaker response');
+    }
+  }
+
   public reset(): Promise<void> {
     return this.hybridCactus.reset();
   }
diff --git a/src/specs/Cactus.nitro.ts b/src/specs/Cactus.nitro.ts
index ea5daa6..d567708 100644
--- a/src/specs/Cactus.nitro.ts
+++ b/src/specs/Cactus.nitro.ts
@@ -13,6 +13,12 @@ export interface Cactus extends HybridObject<{ ios: 'c++'; android: 'c++' }> {
     toolsJson?: string,
     callback?: (token: string, tokenId: number) => void
   ): Promise<string>;
+  prefill( // Native prefill entry point; the resolved string is JSON-parsed by src/native/Cactus.ts.
+    messagesJson: string,
+    responseBufferSize: number,
+    optionsJson?: string,
+    toolsJson?: string
+  ): Promise<string>;
   tokenize(text: string): Promise<number[]>;
   scoreWindow(
     tokens: number[],
@@ -47,6 +53,16 @@ export interface Cactus extends HybridObject<{ ios: 'c++'; android: 'c++' }> {
   ): Promise<number[]>;
   imageEmbed(imagePath: string, embeddingBufferSize: number): Promise<number[]>;
   audioEmbed(audioPath: string, embeddingBufferSize: number): Promise<number[]>;
+  diarize( // Native diarize entry point; the resolved string is JSON-parsed by src/native/Cactus.ts.
+    audio: string | number[],
+    responseBufferSize: number, // the TS wrapper passes 2 * 1024 * 1024
+    optionsJson?: string // snake_case JSON built by the TS wrapper, or omitted
+  ): Promise<string>;
+  embedSpeaker( // Native speaker-embedding entry point; the resolved string is JSON-parsed by src/native/Cactus.ts.
+    audio: string | number[],
+    responseBufferSize: number, // the TS wrapper passes 65536
+    optionsJson?: string // accepted by the spec, but the TS wrapper currently never supplies it
+  ): Promise<string>;
   reset(): Promise<void>;
   stop(): Promise<void>;
   destroy(): Promise<void>;
diff --git a/src/types/CactusAudio.ts b/src/types/CactusAudio.ts
new file mode 100644
index 0000000..12f0c5a
--- /dev/null
+++ b/src/types/CactusAudio.ts
@@ -0,0 +1,73 @@
+import type { CactusModelOptions } from './common';
+
+export interface CactusAudioParams { // Arguments for the CactusAudio constructor and the useCactusAudio hook.
+  model?: string; // when omitted, the CactusAudio constructor uses CactusAudio.defaultModel
+  options?: CactusModelOptions; // quantization / pro; missing fields fall back to CactusAudio.defaultOptions
+}
+
+export interface CactusAudioDownloadParams { // Arguments for CactusAudio.download().
+  onProgress?: (progress: number) => void; // progress callback; download() calls it with 1.0 when isModelPath(model) is true
+}
+
+export interface CactusAudioVADOptions { // Optional settings passed as `options` to vad(); every field may be omitted.
+  threshold?: number;
+  negThreshold?: number;
+  minSpeechDurationMs?: number;
+  maxSpeechDurationS?: number; // NOTE(review): Ms/S suffixes suggest milliseconds/seconds — confirm against the native VAD options
+  minSilenceDurationMs?: number;
+  speechPadMs?: number;
+  windowSizeSamples?: number;
+  samplingRate?: number;
+  minSilenceAtMaxSpeech?: number;
+  useMaxPossSilAtMaxSpeech?: boolean;
+}
+
+export interface CactusAudioVADSegment { // Element type of CactusAudioVADResult.segments.
+  start: number; // NOTE(review): unit of start/end (samples vs. time) is not visible here — confirm
+  end: number;
+}
+
+export interface CactusAudioVADParams { // Arguments for vad().
+  audio: string | number[]; // string inputs have 'file://' stripped by the wrapper; NOTE(review): number[] is presumably raw samples — confirm
+  options?: CactusAudioVADOptions;
+}
+
+export interface CactusAudioVADResult { // Value resolved by vad().
+  segments: CactusAudioVADSegment[];
+  totalTime: number; // logged as "Total time (ms)" in the README example
+  ramUsage: number; // NOTE(review): unit is not visible here — confirm
+}
+
+export interface CactusAudioDiarizeOptions { // Options for diarize(); Cactus.diarize serializes them to snake_case JSON.
+  stepMs?: number; // sent as `step_ms`
+  threshold?: number; // sent as `threshold`
+  numSpeakers?: number; // sent as `num_speakers`
+  minSpeakers?: number; // sent as `min_speakers`
+  maxSpeakers?: number; // sent as `max_speakers`
+}
+
+export interface CactusAudioDiarizeParams { // Arguments for diarize().
+  audio: string | number[]; // string inputs have 'file://' stripped by the wrapper; NOTE(review): number[] is presumably raw samples — confirm
+  options?: CactusAudioDiarizeOptions;
+}
+
+export interface CactusAudioDiarizeResult { // Value resolved by diarize(); copied field-by-field from the native JSON reply without validation.
+  success: boolean; // from `success`
+  error: string | null; // from `error`
+  numSpeakers: number; // from `num_speakers`
+  scores: number[]; // from `scores`; NOTE(review): layout/meaning of the values is not visible here — confirm
+  totalTimeMs: number; // from `total_time_ms`
+  ramUsageMb: number; // from `ram_usage_mb`
+}
+
+export interface CactusAudioEmbedSpeakerParams { // Arguments for embedSpeaker(); there is no options field.
+  audio: string | number[]; // string inputs have 'file://' stripped by the wrapper; NOTE(review): number[] is presumably raw samples — confirm
+}
+
+export interface CactusAudioEmbedSpeakerResult { // Value resolved by embedSpeaker(); copied field-by-field from the native JSON reply without validation.
+  success: boolean; // from `success`
+  error: string | null; // from `error`
+  embedding: number[]; // from `embedding`; NOTE(review): vector length is not visible here — confirm
+  totalTimeMs: number; // from `total_time_ms`
+  ramUsageMb: number; // from `ram_usage_mb`
+}
diff --git a/src/types/CactusLM.ts b/src/types/CactusLM.ts
index a687621..14f53be 100644
--- a/src/types/CactusLM.ts
+++ b/src/types/CactusLM.ts
@@ -29,6 +29,7 @@ export interface CactusLMCompleteOptions {
   toolRagTopK?: number;
   includeStopSequences?: boolean;
   useVad?: boolean;
+  enableThinking?: boolean;
 }
 
 export interface CactusLMTool {
@@ -53,9 +54,25 @@ export interface CactusLMCompleteParams {
   onToken?: (token: string) => void;
 }
 
+export interface CactusLMPrefillParams { // Arguments for CactusLM.prefill().
+  messages: CactusLMMessage[];
+  options?: CactusLMCompleteOptions; // merged over CactusLM.defaultCompleteOptions by prefill()
+  tools?: CactusLMTool[]; // each tool is wrapped as { type: 'function', function: tool } before the native call
+}
+
+export interface CactusLMPrefillResult { // Value resolved by prefill(); copied field-by-field from the native JSON reply without validation.
+  success: boolean; // from `success`
+  error: string | null; // from `error`
+  prefillTokens: number; // from `prefill_tokens`
+  prefillTps: number; // from `prefill_tps`
+  totalTimeMs: number; // from `total_time_ms`
+  ramUsageMb: number; // from `ram_usage_mb`
+}
+
 export interface CactusLMCompleteResult {
   success: boolean;
   response: string;
+  thinking?: string;
   functionCalls?: {
     name: string;
     arguments: { [key: string]: any };
diff --git a/src/types/CactusSTT.ts b/src/types/CactusSTT.ts
index 6686045..db737b0 100644
--- a/src/types/CactusSTT.ts
+++ b/src/types/CactusSTT.ts
@@ -56,6 +56,7 @@ export interface CactusSTTStreamTranscribeStartOptions {
   confirmationThreshold?: number;
   minChunkSize?: number;
   telemetryEnabled?: boolean;
+  language?: string;
 }
 
 export interface CactusSTTStreamTranscribeProcessParams {
diff --git a/src/types/CactusVAD.ts b/src/types/CactusVAD.ts
deleted file mode 100644
index f303e34..0000000
--- a/src/types/CactusVAD.ts
+++ /dev/null
@@ -1,39 +0,0 @@
-import type { CactusModelOptions } from './common';
-
-export interface CactusVADParams {
-  model?: string;
-  options?: CactusModelOptions;
-}
-
-export interface CactusVADDownloadParams {
-  onProgress?: (progress: number) => void;
-}
-
-export interface CactusVADOptions {
-  threshold?: number;
-  negThreshold?: number;
-  minSpeechDurationMs?: number;
-  maxSpeechDurationS?: number;
-  minSilenceDurationMs?: number;
-  speechPadMs?: number;
-  windowSizeSamples?: number;
-  samplingRate?: number;
-  minSilenceAtMaxSpeech?: number;
-  useMaxPossSilAtMaxSpeech?: boolean;
-}
-
-export interface CactusVADSegment {
-  start: number;
-  end: number;
-}
-
-export interface CactusVADResult {
-  segments: CactusVADSegment[];
-  totalTime: number;
-  ramUsage: number;
-}
-
-export interface CactusVADVadParams {
-  audio: string | number[];
-  options?: CactusVADOptions;
-}