@@ -341,7 +341,11 @@ def __init__(
         self._logits_all = logits_all if draft_model is None else True
         self.context_params.embeddings = embedding  # TODO: Rename to embeddings
         self.context_params.offload_kqv = offload_kqv
-        self.context_params.flash_attn = flash_attn
+        self.context_params.flash_attn_type = (
+            llama_cpp.LLAMA_FLASH_ATTN_TYPE_ENABLED
+            if flash_attn
+            else llama_cpp.LLAMA_FLASH_ATTN_TYPE_DISABLED
+        )
 
         if op_offload is not None:
             self.context_params.op_offload = op_offload
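
The old boolean `flash_attn` context flag becomes llama.cpp's tri-state `flash_attn_type` setting. A minimal sketch of the mapping in both directions, assuming only the two constants used in this diff (`LLAMA_FLASH_ATTN_TYPE_ENABLED`, `LLAMA_FLASH_ATTN_TYPE_DISABLED`); any other value the enum may define (e.g. an auto mode) simply reads back as `False`:

```python
import llama_cpp

def flash_attn_type_from_bool(flash_attn: bool) -> int:
    # bool -> enum, as set on context_params in __init__ above
    return (
        llama_cpp.LLAMA_FLASH_ATTN_TYPE_ENABLED
        if flash_attn
        else llama_cpp.LLAMA_FLASH_ATTN_TYPE_DISABLED
    )

def flash_attn_bool_from_type(flash_attn_type: int) -> bool:
    # enum -> bool, mirroring the __getstate__ hunk further down:
    # only an explicit ENABLED counts as True
    return flash_attn_type == llama_cpp.LLAMA_FLASH_ATTN_TYPE_ENABLED
```
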
@@ -431,9 +435,9 @@ def free_lora_adapter():
 
             self._stack.callback(free_lora_adapter)
 
-            if llama_cpp.llama_set_adapter_lora(
-                self._ctx.ctx, self._lora_adapter, self.lora_scale
-            ):
+            adapters = (llama_cpp.llama_adapter_lora_p_ctypes * 1)(self._lora_adapter)
+            scales = (ctypes.c_float * 1)(self.lora_scale)
+            if llama_cpp.llama_set_adapters_lora(self._ctx.ctx, adapters, 1, scales):
                 raise RuntimeError(
                     f"Failed to set LoRA adapter from lora path: {self.lora_path}"
                 )
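
The per-adapter `llama_set_adapter_lora` call gives way to a batched API taking ctypes arrays of adapter handles and scales, applied in one shot. Assuming `llama_set_adapters_lora` keeps the `(ctx, adapters, n, scales)` signature shown above, a hypothetical helper that generalizes this to several adapters (handles obtained elsewhere, e.g. from `llama_adapter_lora_init`):

```python
import ctypes
import llama_cpp

def set_lora_adapters(ctx, pairs):
    """Apply a list of (adapter_handle, scale) pairs in a single call."""
    n = len(pairs)
    adapters = (llama_cpp.llama_adapter_lora_p_ctypes * n)(*(a for a, _ in pairs))
    scales = (ctypes.c_float * n)(*(float(s) for _, s in pairs))
    if llama_cpp.llama_set_adapters_lora(ctx, adapters, n, scales):
        raise RuntimeError("llama_set_adapters_lora failed")
```
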
@@ -726,7 +730,6 @@ def apply_func(token_data_array: llama_cpp.llama_token_data_array_p):
             sampler.add_grammar(self._model, grammar)
 
         if temp < 0.0:
-            sampler.add_softmax()
             sampler.add_dist(self._seed)
         elif temp == 0.0:
             sampler.add_greedy()
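
The explicit `add_softmax()` stage is dropped because the dist sampler normalizes the token distribution itself before drawing, so chaining a softmax ahead of it was redundant. A sketch of the resulting dispatch, assuming a `LlamaSampler`-style wrapper with the `add_*` methods used in this hunk (the `temp > 0` branch, elided from the diff, is an assumption):

```python
def build_sampler_chain(sampler, temp: float, seed: int) -> None:
    # Sketch only; `sampler` is assumed to be the internal LlamaSampler
    # wrapper whose add_* methods appear in the diff above.
    if temp < 0.0:
        # add_dist() normalizes probabilities internally, so no
        # add_softmax() is needed ahead of it anymore.
        sampler.add_dist(seed)
    elif temp == 0.0:
        sampler.add_greedy()  # deterministic argmax
    else:
        sampler.add_temp(temp)  # assumed temperature stage
        sampler.add_dist(seed)
```
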
@@ -1042,7 +1045,7 @@ def embed(
         data: Union[List[List[float]], List[List[List[float]]]] = []
 
         def decode_batch(seq_sizes: List[int]):
-            llama_cpp.llama_kv_self_clear(self._ctx.ctx)
+            self._ctx.kv_cache_clear()
             self._ctx.decode(self._batch)
             self._batch.reset()
 
@@ -1113,7 +1116,7 @@ def decode_batch(seq_sizes: List[int]):
 
         output = data[0] if isinstance(input, str) else data
 
-        llama_cpp.llama_kv_self_clear(self._ctx.ctx)
+        self._ctx.kv_cache_clear()
         self.reset()
 
         if return_count:
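
Both call sites in `embed` now clear the KV cache through the context wrapper rather than the raw C binding, so the underlying symbol (renamed across llama.cpp versions) is resolved in one place. A hypothetical sketch of such a wrapper method; the C entry point it forwards to is an assumption, not taken from this diff:

```python
import llama_cpp

class _LlamaContextSketch:
    """Illustrative slice of the _ctx wrapper used above."""

    def __init__(self, ctx):
        self.ctx = ctx  # raw llama_context pointer

    def kv_cache_clear(self) -> None:
        # Assumption: forwards to whichever cache-clearing function the
        # bundled llama.cpp exposes; callers stay insulated from renames.
        llama_cpp.llama_kv_self_clear(self.ctx)
```
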
@@ -2100,7 +2103,10 @@ def __getstate__(self):
             logits_all=self._logits_all,
             embedding=self.context_params.embeddings,
             offload_kqv=self.context_params.offload_kqv,
-            flash_attn=self.context_params.flash_attn,
+            flash_attn=(
+                self.context_params.flash_attn_type
+                == llama_cpp.LLAMA_FLASH_ATTN_TYPE_ENABLED
+            ),
             op_offload=self.context_params.op_offload,
             swa_full=self.context_params.swa_full,
             # Sampling Params
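
Because `__getstate__` folds the enum back to a boolean, the flash-attention setting round-trips through pickling unchanged. A hedged usage sketch (the model path is illustrative):

```python
import pickle
from llama_cpp import Llama

# flash_attn goes in as a bool, is stored as flash_attn_type in __init__,
# and comes back out of __getstate__ as a bool.
llm = Llama(model_path="./model.gguf", flash_attn=True)  # path is illustrative
llm2 = pickle.loads(pickle.dumps(llm))
```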