diff --git a/CHANGELOG.md b/CHANGELOG.md index 53365e368..e08e52c10 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,19 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.3.10] + +- feat: Update llama.cpp to ggerganov/llama.cpp@8846aace4934ad29651ea61b8c7e3f6b0556e3d2 +- feat: Add support for llama.cpp multimodal, add Qwen2.5-VL chat handler by @abetlen in cd548bd0f14210627798237d5c2ea78acfb88ccb + +## [0.3.9] + +- feat: Update llama.cpp to ggerganov/llama.cpp@8733e0cf6eefc7c7752297cc22d0836706f4222c + +## [0.3.8] + +- feat: Update llama.cpp to ggerganov/llama.cpp@7841fc723e059d1fd9640e5c0ef19050fcc7c698 + ## [0.3.7] - feat: Update llama.cpp to ggerganov/llama.cpp@794fe23f29fb40104975c91fe19f23798f7c726e diff --git a/CMakeLists.txt b/CMakeLists.txt index 64a0304a1..4b06d98b3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -62,6 +62,9 @@ if (LLAMA_BUILD) # Enable building of the common library set(LLAMA_BUILD_COMMON ON CACHE BOOL "Build llama.cpp common library" FORCE) + # Disable building curl support + set(LLAMA_CURL OFF CACHE BOOL "llama.cpp: enable curl" FORCE) + # Architecture detection and settings for Apple platforms if (APPLE) # Get the target architecture @@ -93,7 +96,15 @@ if (LLAMA_BUILD) set(GGML_METAL_EMBED_LIBRARY "ON" CACHE BOOL "ggml: embed metal library" FORCE) endif() + add_subdirectory(vendor/llama.cpp) + + if (WIN32) + if (TARGET llama) + set_target_properties(llama PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON) + endif() + endif() + llama_cpp_python_install_target(llama) llama_cpp_python_install_target(ggml) @@ -143,35 +154,34 @@ if (LLAMA_BUILD) endif() # Building llava - add_subdirectory(vendor/llama.cpp/examples/llava) - set_target_properties(llava_shared PROPERTIES OUTPUT_NAME "llava") + add_subdirectory(vendor/llama.cpp/tools/mtmd) if (WIN32) - set_target_properties(llava_shared PROPERTIES CUDA_ARCHITECTURES OFF) + set_target_properties(mtmd PROPERTIES CUDA_ARCHITECTURES OFF) endif() - llama_cpp_python_install_target(llava_shared) + llama_cpp_python_install_target(mtmd) if (WIN32) install( - FILES $ + FILES $ DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib ) install( - FILES $ + FILES $ DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp/lib ) endif() - # Fix for llava build: Add include directory for llama.h + # Fix for mtmd build: Add include directory for llama.h # Move these commands after the add_subdirectory call - target_include_directories(llava PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/include) - target_include_directories(llava PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/ggml/include) + target_include_directories(mtmd PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/include) + target_include_directories(mtmd PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/ggml/include) if (BUILD_SHARED_LIBS) - target_include_directories(llava_shared PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/include) - target_include_directories(llava_shared PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/ggml/include) + target_include_directories(mtmd PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/include) + target_include_directories(mtmd PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/ggml/include) endif() - target_include_directories(llama-llava-cli PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/include) - target_include_directories(llama-minicpmv-cli PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/include) + # target_include_directories(llama-llava-cli PUBLIC 
${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/include) + # target_include_directories(llama-minicpmv-cli PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/include) endif() endif() diff --git a/README.md b/README.md index e00456580..088a23779 100644 --- a/README.md +++ b/README.md @@ -505,6 +505,7 @@ Below are the supported multi-modal models and their respective chat handlers (P | [nanollava](https://huggingface.co/abetlen/nanollava-gguf) | `NanollavaChatHandler` | `nanollava` | | [llama-3-vision-alpha](https://huggingface.co/abetlen/llama-3-vision-alpha-gguf) | `Llama3VisionAlphaChatHandler` | `llama-3-vision-alpha` | | [minicpm-v-2.6](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) | `MiniCPMv26ChatHandler` | `minicpm-v-2.6` | +| [qwen2.5-vl](https://huggingface.co/unsloth/Qwen2.5-VL-3B-Instruct-GGUF) | `Qwen25VLChatHandler` | `qwen2.5-vl` | Then you'll need to use a custom chat handler to load the clip model and process the chat messages and images. diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index fc1fcbcf6..11a511390 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.3.7" +__version__ = "0.3.10" diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index 343581dce..18d733481 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -9,6 +9,8 @@ Tuple, Optional, Sequence, + Callable, + Union, ) from dataclasses import dataclass, field from contextlib import ExitStack @@ -48,7 +50,7 @@ def __init__( raise ValueError(f"Model path does not exist: {path_model}") with suppress_stdout_stderr(disable=verbose): - model = llama_cpp.llama_load_model_from_file( + model = llama_cpp.llama_model_load_from_file( self.path_model.encode("utf-8"), self.params ) @@ -62,32 +64,38 @@ def __init__( self.model = model self.vocab = vocab + self.sampler = None # LlamaModel doesn't use samplers, but some cleanup code expects this attribute def free_model(): if self.model is None: return - llama_cpp.llama_free_model(self.model) + llama_cpp.llama_model_free(self.model) self.model = None self._exit_stack.callback(free_model) def close(self): + if self.sampler is not None: + # NOTE: Must remove custom samplers before free or llama.cpp will try to free them + for i, _ in reversed(self.custom_samplers): + llama_cpp.llama_sampler_chain_remove(self.sampler, i) + self.custom_samplers.clear() self._exit_stack.close() def __del__(self): self.close() def vocab_type(self) -> int: - return llama_cpp.llama_vocab_type(self.model) + return llama_cpp.llama_vocab_type(self.vocab) def n_vocab(self) -> int: - return llama_cpp.llama_n_vocab(self.vocab) + return llama_cpp.llama_vocab_n_tokens(self.vocab) def n_ctx_train(self) -> int: - return llama_cpp.llama_n_ctx_train(self.model) + return llama_cpp.llama_model_n_ctx_train(self.model) def n_embd(self) -> int: - return llama_cpp.llama_n_embd(self.model) + return llama_cpp.llama_model_n_embd(self.model) def rope_freq_scale_train(self) -> float: return llama_cpp.llama_model_rope_freq_scale_train(self.model) @@ -109,48 +117,48 @@ def get_tensor(self, name: str) -> ctypes.c_void_p: # Vocab def token_get_text(self, token: int) -> str: - return llama_cpp.llama_token_get_text(self.vocab, token).decode("utf-8") + return llama_cpp.llama_vocab_get_text(self.vocab, token).decode("utf-8") def token_get_score(self, token: int) -> float: - return llama_cpp.llama_token_get_score(self.vocab, token) + return llama_cpp.llama_vocab_get_score(self.vocab, token) def 
token_get_attr(self, token: int) -> int: - return llama_cpp.llama_token_get_attr(self.vocab, token) + return llama_cpp.llama_vocab_get_attr(self.vocab, token) # Special tokens def token_bos(self) -> int: - return llama_cpp.llama_token_bos(self.vocab) + return llama_cpp.llama_vocab_bos(self.vocab) def token_eos(self) -> int: - return llama_cpp.llama_token_eos(self.vocab) + return llama_cpp.llama_vocab_eos(self.vocab) def token_cls(self) -> int: - return llama_cpp.llama_token_cls(self.vocab) + return llama_cpp.llama_vocab_cls(self.vocab) def token_sep(self) -> int: - return llama_cpp.llama_token_sep(self.vocab) + return llama_cpp.llama_vocab_sep(self.vocab) def token_nl(self) -> int: - return llama_cpp.llama_token_nl(self.vocab) + return llama_cpp.llama_vocab_nl(self.vocab) def token_prefix(self) -> int: - raise NotImplementedError("token_prefix is not implemented in llama.cpp") + return llama_cpp.llama_vocab_fim_pre(self.vocab) def token_middle(self) -> int: - raise NotImplementedError("token_middle is not implemented in llama.cpp") + return llama_cpp.llama_vocab_fim_mid(self.vocab) def token_suffix(self) -> int: - raise NotImplementedError("token_suffix is not implemented in llama.cpp") + return llama_cpp.llama_vocab_fim_suf(self.vocab) def token_eot(self) -> int: - return llama_cpp.llama_token_eot(self.vocab) + return llama_cpp.llama_vocab_eot(self.vocab) def add_bos_token(self) -> bool: - return llama_cpp.llama_add_bos_token(self.vocab) + return llama_cpp.llama_vocab_get_add_bos(self.vocab) def add_eos_token(self) -> bool: - return llama_cpp.llama_add_eos_token(self.vocab) + return llama_cpp.llama_vocab_get_add_eos(self.vocab) # Tokenization @@ -249,12 +257,14 @@ def __init__( self.verbose = verbose self._exit_stack = ExitStack() - ctx = llama_cpp.llama_new_context_with_model(self.model.model, self.params) + ctx = llama_cpp.llama_init_from_model(self.model.model, self.params) if ctx is None: raise ValueError("Failed to create llama_context") self.ctx = ctx + self.memory = llama_cpp.llama_get_memory(self.ctx) + self.sampler = None # LlamaContext doesn't manage samplers directly, but some cleanup code expects this attribute def free_ctx(): if self.ctx is None: @@ -277,22 +287,22 @@ def pooling_type(self) -> int: return llama_cpp.llama_pooling_type(self.ctx) def kv_cache_clear(self): - llama_cpp.llama_kv_cache_clear(self.ctx) + llama_cpp.llama_memory_clear(self.memory, True) def kv_cache_seq_rm(self, seq_id: int, p0: int, p1: int): - llama_cpp.llama_kv_cache_seq_rm(self.ctx, seq_id, p0, p1) + llama_cpp.llama_memory_seq_rm(self.memory, seq_id, p0, p1) def kv_cache_seq_cp(self, seq_id_src: int, seq_id_dst: int, p0: int, p1: int): - llama_cpp.llama_kv_cache_seq_cp(self.ctx, seq_id_src, seq_id_dst, p0, p1) + llama_cpp.llama_memory_seq_cp(self.memory, seq_id_src, seq_id_dst, p0, p1) def kv_cache_seq_keep(self, seq_id: int): - llama_cpp.llama_kv_cache_seq_keep(self.ctx, seq_id) + llama_cpp.llama_memory_seq_keep(self.memory, seq_id) def kv_cache_seq_shift(self, seq_id: int, p0: int, p1: int, shift: int): - llama_cpp.llama_kv_cache_seq_add(self.ctx, seq_id, p0, p1, shift) + llama_cpp.llama_memory_seq_add(self.memory, seq_id, p0, p1, shift) def get_state_size(self) -> int: - return llama_cpp.llama_get_state_size(self.ctx) + return llama_cpp.llama_state_get_size(self.ctx) # TODO: copy_state_data @@ -310,6 +320,14 @@ def decode(self, batch: LlamaBatch): if return_code != 0: raise RuntimeError(f"llama_decode returned {return_code}") + def encode(self, batch: LlamaBatch): + return_code = 
llama_cpp.llama_encode( + self.ctx, + batch.batch, + ) + if return_code != 0: + raise RuntimeError(f"llama_encode returned {return_code}") + def set_n_threads(self, n_threads: int, n_threads_batch: int): llama_cpp.llama_set_n_threads(self.ctx, n_threads, n_threads_batch) @@ -322,12 +340,16 @@ def get_logits_ith(self, i: int): def get_embeddings(self): return llama_cpp.llama_get_embeddings(self.ctx) - # Sampling functions + def get_embeddings_ith(self, i: int): + return llama_cpp.llama_get_embeddings_ith(self.ctx, i) + + def get_embeddings_seq(self, seq_id: int): + return llama_cpp.llama_get_embeddings_seq(self.ctx, seq_id) + + # Sampling functions - deprecated, use LlamaSampler instead def set_rng_seed(self, seed: int): - # TODO: Fix - # llama_cpp.llama_set_rng_seed(self.ctx, seed) - raise NotImplementedError("set_rng_seed is not implemented in llama.cpp") + raise NotImplementedError("set_rng_seed is deprecated, use LlamaSampler instead") def sample_repetition_penalties( self, @@ -338,63 +360,30 @@ def sample_repetition_penalties( penalty_freq: float, penalty_present: float, ): - # llama_cpp.llama_sample_repetition_penalties( - # self.ctx, - # llama_cpp.byref(candidates.candidates), - # last_tokens_data, - # penalty_last_n, - # penalty_repeat, - # penalty_freq, - # penalty_present, - # ) - raise NotImplementedError("sample_repetition_penalties is not implemented in llama.cpp") + raise NotImplementedError("sample_repetition_penalties is deprecated, use LlamaSampler instead") def sample_softmax(self, candidates: "_LlamaTokenDataArray"): - # llama_cpp.llama_sample_softmax( - # self.ctx, - # llama_cpp.byref(candidates.candidates), - # ) - raise NotImplementedError("sample_softmax is not implemented in llama.cpp") + raise NotImplementedError("sample_softmax is deprecated, use LlamaSampler instead") def sample_top_k(self, candidates: "_LlamaTokenDataArray", k: int, min_keep: int): - # llama_cpp.llama_sample_top_k( - # self.ctx, llama_cpp.byref(candidates.candidates), k, min_keep - # ) - raise NotImplementedError("sample_top_k is not implemented in llama.cpp") + raise NotImplementedError("sample_top_k is deprecated, use LlamaSampler instead") def sample_top_p(self, candidates: "_LlamaTokenDataArray", p: float, min_keep: int): - # llama_cpp.llama_sample_top_p( - # self.ctx, llama_cpp.byref(candidates.candidates), p, min_keep - # ) - raise NotImplementedError("sample_top_p is not implemented in llama.cpp") + raise NotImplementedError("sample_top_p is deprecated, use LlamaSampler instead") def sample_min_p(self, candidates: "_LlamaTokenDataArray", p: float, min_keep: int): - # llama_cpp.llama_sample_min_p( - # self.ctx, llama_cpp.byref(candidates.candidates), p, min_keep - # ) - raise NotImplementedError("sample_min_p is not implemented in llama.cpp") + raise NotImplementedError("sample_min_p is deprecated, use LlamaSampler instead") def sample_typical( self, candidates: "_LlamaTokenDataArray", p: float, min_keep: int ): - # llama_cpp.llama_sample_typical( - # self.ctx, llama_cpp.byref(candidates.candidates), p, min_keep - # ) - raise NotImplementedError("sample_typical is not implemented in llama.cpp") + raise NotImplementedError("sample_typical is deprecated, use LlamaSampler instead") def sample_temp(self, candidates: "_LlamaTokenDataArray", temp: float): - # llama_cpp.llama_sample_temp( - # self.ctx, llama_cpp.byref(candidates.candidates), temp - # ) - raise NotImplementedError("sample_temp is not implemented in llama.cpp") + raise NotImplementedError("sample_temp is deprecated, use LlamaSampler 
instead") def sample_grammar(self, candidates: "_LlamaTokenDataArray", grammar: LlamaGrammar): - # llama_cpp.llama_sample_grammar( - # self.ctx, - # llama_cpp.byref(candidates.candidates), - # grammar.grammar, - # ) - raise NotImplementedError("sample_grammar is not implemented in llama.cpp") + raise NotImplementedError("sample_grammar is deprecated, use LlamaSampler instead") def sample_token_mirostat( self, @@ -404,15 +393,7 @@ def sample_token_mirostat( m: int, mu: llama_cpp.CtypesPointerOrRef[ctypes.c_float], ) -> int: - raise NotImplementedError("sample_token_mirostat is not implemented in llama.cpp") - # return llama_cpp.llama_sample_token_mirostat( - # self.ctx, - # llama_cpp.byref(candidates.candidates), - # tau, - # eta, - # m, - # mu, - # ) + raise NotImplementedError("sample_token_mirostat is deprecated, use LlamaSampler instead") def sample_token_mirostat_v2( self, @@ -421,33 +402,17 @@ def sample_token_mirostat_v2( eta: float, mu: llama_cpp.CtypesPointerOrRef[ctypes.c_float], ) -> int: - raise NotImplementedError("sample_token_mirostat_v2 is not implemented in llama.cpp") - # return llama_cpp.llama_sample_token_mirostat_v2( - # self.ctx, - # llama_cpp.byref(candidates.candidates), - # tau, - # eta, - # mu, - # ) + raise NotImplementedError("sample_token_mirostat_v2 is deprecated, use LlamaSampler instead") def sample_token_greedy(self, candidates: "_LlamaTokenDataArray") -> int: - raise NotImplementedError("sample_token_greedy is not implemented in llama.cpp") - # return llama_cpp.llama_sample_token_greedy( - # self.ctx, - # llama_cpp.byref(candidates.candidates), - # ) + raise NotImplementedError("sample_token_greedy is deprecated, use LlamaSampler instead") def sample_token(self, candidates: "_LlamaTokenDataArray") -> int: - raise NotImplementedError("sample_token is not implemented in llama.cpp") - # return llama_cpp.llama_sample_token( - # self.ctx, - # llama_cpp.byref(candidates.candidates), - # ) + raise NotImplementedError("sample_token is deprecated, use LlamaSampler instead") # Grammar def grammar_accept_token(self, grammar: LlamaGrammar, token: int): - raise NotImplementedError("grammar_accept_token is not implemented in llama.cpp") - # llama_cpp.llama_grammar_accept_token(grammar.grammar, self.ctx, token) + raise NotImplementedError("grammar_accept_token is deprecated, use LlamaSampler instead") def reset_timings(self): llama_cpp.llama_perf_context_reset(self.ctx) @@ -478,6 +443,7 @@ def __init__( raise ValueError("Failed to create llama_batch") self.batch = batch + self.sampler = None # LlamaBatch doesn't use samplers, but some cleanup code expects this attribute def free_batch(): if self.batch is None: @@ -540,6 +506,7 @@ def __init__(self, *, n_vocab: int): ) self.default_candidates_data_id = np.arange(self.n_vocab, dtype=np.intc) # type: ignore self.default_candidates_data_p = np.zeros(self.n_vocab, dtype=np.single) + self.sampler = None # LlamaTokenDataArray doesn't use samplers, but some cleanup code expects this attribute def copy_logits(self, logits: npt.NDArray[np.single]): self.candidates_data.id[:] = self.default_candidates_data_id @@ -628,103 +595,16 @@ def sample( idx: int = 0, logits_array: Optional[npt.NDArray[np.single]] = None, ): - n_vocab = ctx_main.model.n_vocab() - id: int = 0 - - if logits_array is None: - logits = ctx_main.get_logits_ith(idx) - logits_array = np.array( - ctypes.cast(logits, ctypes.POINTER(ctypes.c_float * n_vocab)).contents, - dtype=np.single, - ) - - # apply logit_bias - for token, logit_bias in 
self.params.logit_bias.items(): - logits_array[token] += logit_bias - - token_data_array = LlamaTokenDataArray( - n_vocab=n_vocab - ) # TODO: Only create this once - token_data_array.copy_logits(logits_array) - - # apply penalties - if len(self.prev) > 0: - nl_token = ctx_main.model.token_nl() - nl_logit = logits_array[nl_token] - last_tokens = self.prev[-self.params.penalty_last_n :] - last_tokens_size = min(len(last_tokens), self.params.penalty_last_n) - if last_tokens_size > 0: - last_tokens_p = (llama_cpp.llama_token * len(last_tokens))(*last_tokens) - ctx_main.sample_repetition_penalties( - token_data_array, - last_tokens_p, - last_tokens_size, - self.params.penalty_repeat, - self.params.penalty_freq, - self.params.penalty_present, - ) - if not self.params.penalize_nl: - token_data_array.candidates_data.logit[nl_token] = nl_logit - - if self.grammar is not None: - ctx_main.sample_grammar(token_data_array, self.grammar) - - if self.params.temp < 0: - ctx_main.sample_softmax(token_data_array) - id = token_data_array.candidates_data.id[0] - elif self.params.temp == 0: - id = ctx_main.sample_token_greedy(token_data_array) - else: - if self.params.mirostat == 1: - mirostat_m = 100 - ctx_main.sample_temp(token_data_array, self.params.temp) - id = ctx_main.sample_token_mirostat( - token_data_array, - self.params.mirostat_tau, - self.params.mirostat_eta, - mirostat_m, - ctypes.pointer(self.mirostat_mu), - ) - elif self.params.mirostat == 2: - ctx_main.sample_temp(token_data_array, self.params.temp) - id = ctx_main.sample_token_mirostat_v2( - token_data_array, - self.params.mirostat_tau, - self.params.mirostat_eta, - ctypes.pointer(self.mirostat_mu), - ) - else: - min_keep = max(1, self.params.n_probs) - ctx_main.sample_top_k( - token_data_array, self.params.top_k, min_keep=min_keep - ) - ctx_main.sample_typical( - token_data_array, self.params.typical_p, min_keep=min_keep - ) - ctx_main.sample_top_p( - token_data_array, self.params.top_p, min_keep=min_keep - ) - ctx_main.sample_min_p( - token_data_array, self.params.min_p, min_keep=min_keep - ) - ctx_main.sample_temp(token_data_array, self.params.temp) - id = ctx_main.sample_token(token_data_array) - return id + # This method is deprecated in favor of using LlamaSampler directly + raise NotImplementedError("LlamaSamplingContext.sample is deprecated, use LlamaSampler instead") def accept(self, ctx_main: LlamaContext, id: int, apply_grammar: bool): - if apply_grammar and self.grammar is not None: - ctx_main.grammar_accept_token(self.grammar, id) self.prev.append(id) -from typing import List, Callable, Optional, Union -import ctypes -import llama_cpp - - class CustomSampler: def __init__( - self, apply_func: typing.Callable[[llama_cpp.llama_token_data_array], None] + self, apply_func: Callable[[llama_cpp.llama_token_data_array], None] ): self.apply_func = apply_func @@ -757,72 +637,117 @@ def get_sampler(self) -> llama_cpp.llama_sampler_p: class LlamaSampler: def __init__(self): - params = llama_cpp.llama_sampler_chain_params() + params = llama_cpp.llama_sampler_chain_default_params() self.sampler = llama_cpp.llama_sampler_chain_init(params) - self.samplers: List[llama_cpp.llama_sampler_p] = [] self.custom_samplers: List[Tuple[int, CustomSampler]] = [] + self._exit_stack = ExitStack() + + def free_sampler(): + if self.sampler is not None: + # NOTE: Must remove custom samplers before free or llama.cpp will try to free them + for i, _ in reversed(self.custom_samplers): + llama_cpp.llama_sampler_chain_remove(self.sampler, i) + 
llama_cpp.llama_sampler_free(self.sampler) + self.sampler = None + + self._exit_stack.callback(free_sampler) + + def close(self): + self._exit_stack.close() + + def __del__(self): + self.close() def add_greedy(self): sampler = llama_cpp.llama_sampler_init_greedy() - self._add_sampler(sampler) + llama_cpp.llama_sampler_chain_add(self.sampler, sampler) def add_dist(self, seed: int): sampler = llama_cpp.llama_sampler_init_dist(seed) - self._add_sampler(sampler) + llama_cpp.llama_sampler_chain_add(self.sampler, sampler) def add_softmax(self): sampler = llama_cpp.llama_sampler_init_softmax() - self._add_sampler(sampler) + llama_cpp.llama_sampler_chain_add(self.sampler, sampler) def add_top_k(self, k: int): sampler = llama_cpp.llama_sampler_init_top_k(k) - self._add_sampler(sampler) + llama_cpp.llama_sampler_chain_add(self.sampler, sampler) - def add_top_p(self, p: float, min_keep: int): + def add_top_p(self, p: float, min_keep: int = 1): sampler = llama_cpp.llama_sampler_init_top_p(p, min_keep) - self._add_sampler(sampler) + llama_cpp.llama_sampler_chain_add(self.sampler, sampler) - def add_min_p(self, p: float, min_keep: int): + def add_min_p(self, p: float, min_keep: int = 1): sampler = llama_cpp.llama_sampler_init_min_p(p, min_keep) - self._add_sampler(sampler) + llama_cpp.llama_sampler_chain_add(self.sampler, sampler) - def add_typical(self, p: float, min_keep: int): + def add_typical(self, p: float, min_keep: int = 1): sampler = llama_cpp.llama_sampler_init_typical(p, min_keep) - self._add_sampler(sampler) + llama_cpp.llama_sampler_chain_add(self.sampler, sampler) def add_temp(self, temp: float): sampler = llama_cpp.llama_sampler_init_temp(temp) - self._add_sampler(sampler) + llama_cpp.llama_sampler_chain_add(self.sampler, sampler) def add_temp_ext(self, t: float, delta: float, exponent: float): sampler = llama_cpp.llama_sampler_init_temp_ext(t, delta, exponent) - self._add_sampler(sampler) + llama_cpp.llama_sampler_chain_add(self.sampler, sampler) + + def add_xtc(self, p: float, t: float, min_keep: int, seed: int): + sampler = llama_cpp.llama_sampler_init_xtc(p, t, min_keep, seed) + llama_cpp.llama_sampler_chain_add(self.sampler, sampler) + + def add_top_n_sigma(self, n: float): + sampler = llama_cpp.llama_sampler_init_top_n_sigma(n) + llama_cpp.llama_sampler_chain_add(self.sampler, sampler) def add_mirostat(self, n_vocab: int, seed: int, tau: float, eta: float, m: int): sampler = llama_cpp.llama_sampler_init_mirostat(n_vocab, seed, tau, eta, m) - self._add_sampler(sampler) + llama_cpp.llama_sampler_chain_add(self.sampler, sampler) def add_mirostat_v2(self, seed: int, tau: float, eta: float): sampler = llama_cpp.llama_sampler_init_mirostat_v2(seed, tau, eta) - self._add_sampler(sampler) + llama_cpp.llama_sampler_chain_add(self.sampler, sampler) def add_grammar(self, model: LlamaModel, grammar: LlamaGrammar): sampler = llama_cpp.llama_sampler_init_grammar( model.vocab, grammar._grammar.encode("utf-8"), grammar._root.encode("utf-8") ) - self._add_sampler(sampler) + llama_cpp.llama_sampler_chain_add(self.sampler, sampler) + + def add_grammar_lazy_patterns( + self, + model: LlamaModel, + grammar: LlamaGrammar, + trigger_patterns: List[str], + trigger_tokens: List[int] + ): + # Convert patterns to C array + pattern_ptrs = (ctypes.c_char_p * len(trigger_patterns))() + for i, pattern in enumerate(trigger_patterns): + pattern_ptrs[i] = pattern.encode("utf-8") + + # Convert tokens to C array + token_array = (llama_cpp.llama_token * len(trigger_tokens))(*trigger_tokens) + + sampler = 
llama_cpp.llama_sampler_init_grammar_lazy_patterns( + model.vocab, + grammar._grammar.encode("utf-8"), + grammar._root.encode("utf-8"), + pattern_ptrs, + len(trigger_patterns), + token_array, + len(trigger_tokens) + ) + llama_cpp.llama_sampler_chain_add(self.sampler, sampler) def add_penalties( self, - n_vocab: int, - special_eos_id: int, - linefeed_id: int, penalty_last_n: int, penalty_repeat: float, penalty_freq: float, penalty_present: float, - penalize_nl: bool, - ignore_eos: bool, ): sampler = llama_cpp.llama_sampler_init_penalties( penalty_last_n, @@ -830,50 +755,96 @@ def add_penalties( penalty_freq, penalty_present, ) - self._add_sampler(sampler) + llama_cpp.llama_sampler_chain_add(self.sampler, sampler) - def init_logit_bias( - self, n_vocab: int, n_logit_bias, logit_bias: llama_cpp.llama_logit_bias_p + def add_dry( + self, + model: LlamaModel, + n_ctx_train: int, + dry_multiplier: float, + dry_base: float, + dry_allowed_length: int, + dry_penalty_last_n: int, + seq_breakers: List[str] + ): + # Convert seq_breakers to C array + breaker_ptrs = (ctypes.c_char_p * len(seq_breakers))() + for i, breaker in enumerate(seq_breakers): + breaker_ptrs[i] = breaker.encode("utf-8") + + sampler = llama_cpp.llama_sampler_init_dry( + model.vocab, + n_ctx_train, + dry_multiplier, + dry_base, + dry_allowed_length, + dry_penalty_last_n, + breaker_ptrs, + len(seq_breakers) + ) + llama_cpp.llama_sampler_chain_add(self.sampler, sampler) + + def add_logit_bias( + self, + n_vocab: int, + logit_bias: Dict[int, float] ): + # Convert logit_bias dict to C array + bias_array = (llama_cpp.llama_logit_bias * len(logit_bias))() + for i, (token, bias) in enumerate(logit_bias.items()): + bias_array[i].token = token + bias_array[i].bias = bias + sampler = llama_cpp.llama_sampler_init_logit_bias( - n_vocab, n_logit_bias, logit_bias + n_vocab, + len(logit_bias), + bias_array ) - self._add_sampler(sampler) + llama_cpp.llama_sampler_chain_add(self.sampler, sampler) + + def add_infill(self, model: LlamaModel): + sampler = llama_cpp.llama_sampler_init_infill(model.vocab) + llama_cpp.llama_sampler_chain_add(self.sampler, sampler) def add_custom( self, apply_func: Callable[[llama_cpp.llama_token_data_array], None] ): custom_sampler = CustomSampler(apply_func) sampler = custom_sampler.get_sampler() - self._add_sampler(sampler) + llama_cpp.llama_sampler_chain_add(self.sampler, sampler) # NOTE: Must remove custom samplers before free or llama.cpp will try to free them self.custom_samplers.append( (llama_cpp.llama_sampler_chain_n(self.sampler) - 1, custom_sampler) ) - def _add_sampler(self, sampler: llama_cpp.llama_sampler_p): - assert self.sampler is not None - llama_cpp.llama_sampler_chain_add(self.sampler, sampler) - self.samplers.append(sampler) - def get_seed(self) -> int: - assert self.sampler is not None return llama_cpp.llama_sampler_get_seed(self.sampler) - def sample(self, ctx: LlamaContext, idx: int) -> int: - assert self.sampler is not None - assert ctx.ctx is not None + def sample(self, ctx: LlamaContext, idx: int = -1) -> int: return llama_cpp.llama_sampler_sample(self.sampler, ctx.ctx, idx) - def close(self): - if self.sampler: - # NOTE: Must remove custom samplers before free or llama.cpp will try to free them - for i, _ in reversed(self.custom_samplers): - llama_cpp.llama_sampler_chain_remove(self.sampler, i) - llama_cpp.llama_sampler_free(self.sampler) - self.sampler = None - self.samplers.clear() - self.custom_samplers.clear() + def accept(self, token: int): + llama_cpp.llama_sampler_accept(self.sampler, 
token) - def __del__(self): - self.close() + def reset(self): + llama_cpp.llama_sampler_reset(self.sampler) + + def clone(self): + # NOTE: Custom samplers cannot be cloned due to Python callback limitations + if self.custom_samplers: + raise NotImplementedError("Cannot clone LlamaSampler that contains custom samplers") + + cloned_sampler = llama_cpp.llama_sampler_clone(self.sampler) + # Create a new wrapper around the cloned sampler + new_sampler = LlamaSampler.__new__(LlamaSampler) + new_sampler.sampler = cloned_sampler + new_sampler.custom_samplers = [] + new_sampler._exit_stack = ExitStack() + + def free_sampler(): + if new_sampler.sampler is not None: + llama_cpp.llama_sampler_free(new_sampler.sampler) + new_sampler.sampler = None + + new_sampler._exit_stack.callback(free_sampler) + return new_sampler diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 7e9a6af23..cdc05c7ad 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -66,7 +66,6 @@ def __init__( split_mode: int = llama_cpp.LLAMA_SPLIT_MODE_LAYER, main_gpu: int = 0, tensor_split: Optional[List[float]] = None, - rpc_servers: Optional[str] = None, vocab_only: bool = False, use_mmap: bool = True, use_mlock: bool = False, @@ -93,6 +92,8 @@ def __init__( embedding: bool = False, offload_kqv: bool = True, flash_attn: bool = False, + op_offloat: Optional[bool] = None, + swa_full: Optional[bool] = None, # Sampling Params no_perf: bool = False, last_n_tokens_size: int = 64, @@ -150,7 +151,6 @@ def __init__( split_mode: How to split the model across GPUs. See llama_cpp.LLAMA_SPLIT_* for options. main_gpu: main_gpu interpretation depends on split_mode: LLAMA_SPLIT_MODE_NONE: the GPU that is used for the entire model. LLAMA_SPLIT_MODE_ROW: the GPU that is used for small tensors and intermediate results. LLAMA_SPLIT_MODE_LAYER: ignored tensor_split: How split tensors should be distributed across GPUs. If None, the model is not split. - rpc_servers: Comma separated list of RPC servers to use for offloading vocab_only: Only load the vocabulary no weights. use_mmap: Use mmap if possible. use_mlock: Force the system to keep the model in RAM. @@ -174,6 +174,8 @@ def __init__( embedding: Embedding mode only. offload_kqv: Offload K, Q, V to GPU. flash_attn: Use flash attention. + op_offloat: offload host tensor operations to device + swa_full: use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055) no_perf: Measure performance timings. last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque. lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model. 
@@ -226,11 +228,6 @@ def __init__( ) # 0x7FFFFFFF is INT32 max, will be auto set to all layers self.model_params.split_mode = split_mode self.model_params.main_gpu = main_gpu - if rpc_servers is not None: - self.model_params.rpc_servers = rpc_servers.encode("utf-8") - self._rpc_servers = rpc_servers - else: - self._rpc_servers = None self.tensor_split = tensor_split self._c_tensor_split = None if self.tensor_split is not None: @@ -341,12 +338,17 @@ def __init__( yarn_beta_slow if yarn_beta_slow != 0.0 else 0 ) self.context_params.yarn_orig_ctx = yarn_orig_ctx if yarn_orig_ctx != 0 else 0 - self.context_params.logits_all = ( - logits_all if draft_model is None else True - ) # Must be set to True for speculative decoding + self._logits_all = logits_all if draft_model is None else True self.context_params.embeddings = embedding # TODO: Rename to embeddings self.context_params.offload_kqv = offload_kqv self.context_params.flash_attn = flash_attn + + if op_offloat is not None: + self.context_params.op_offloat = op_offloat + + if swa_full is not None: + self.context_params.swa_full = swa_full + # KV cache quantization if type_k is not None: self.context_params.type_k = type_k @@ -568,7 +570,7 @@ def eval_tokens(self) -> Deque[int]: def eval_logits(self) -> Deque[List[float]]: return deque( self.scores[: self.n_tokens, :].tolist(), - maxlen=self._n_ctx if self.context_params.logits_all else 1, + maxlen=self._n_ctx if self._logits_all else 1, ) def tokenize( @@ -641,13 +643,13 @@ def eval(self, tokens: Sequence[int]): n_past = self.n_tokens n_tokens = len(batch) self._batch.set_batch( - batch=batch, n_past=n_past, logits_all=self.context_params.logits_all + batch=batch, n_past=n_past, logits_all=self._logits_all ) self._ctx.decode(self._batch) # Save tokens self.input_ids[n_past : n_past + n_tokens] = batch # Save logits - if self.context_params.logits_all: + if self._logits_all: rows = n_tokens cols = self._n_vocab logits = np.ctypeslib.as_array( @@ -709,15 +711,15 @@ def apply_func(token_data_array: llama_cpp.llama_token_data_array_p): sampler.add_custom(apply_func) sampler.add_penalties( - n_vocab=self._n_vocab, - special_eos_id=self._token_eos, - linefeed_id=self._token_nl, + # n_vocab=self._n_vocab, + # special_eos_id=self._token_eos, + # linefeed_id=self._token_nl, penalty_last_n=self.last_n_tokens_size, penalty_repeat=repeat_penalty, penalty_freq=frequency_penalty, penalty_present=presence_penalty, - penalize_nl=penalize_nl, - ignore_eos=False, + # penalize_nl=penalize_nl, + # ignore_eos=False, ) if grammar is not None: @@ -1288,7 +1290,7 @@ def logit_bias_processor( else: stop_sequences = [] - if logprobs is not None and self.context_params.logits_all is False: + if logprobs is not None and self._logits_all is False: raise ValueError( "logprobs is not supported for models created with logits_all=False" ) @@ -2091,10 +2093,12 @@ def __getstate__(self): yarn_beta_fast=self.context_params.yarn_beta_fast, yarn_beta_slow=self.context_params.yarn_beta_slow, yarn_orig_ctx=self.context_params.yarn_orig_ctx, - logits_all=self.context_params.logits_all, + logits_all=self._logits_all, embedding=self.context_params.embeddings, offload_kqv=self.context_params.offload_kqv, flash_attn=self.context_params.flash_attn, + op_offloat=self.context_params.op_offloat, + swa_full=self.context_params.swa_full, # Sampling Params no_perf=self.context_params.no_perf, last_n_tokens_size=self.last_n_tokens_size, diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 17575c700..a288db7b0 
100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -28,6 +28,7 @@ import numpy as np import numpy.typing as npt +import llama_cpp.llama_cpp as llama_cpp import llama_cpp.llama as llama import llama_cpp.llama_types as llama_types import llama_cpp.llama_grammar as llama_grammar @@ -2651,7 +2652,7 @@ def generate_streaming(tools, functions, function_call, prompt): class Llava15ChatHandler: DEFAULT_SYSTEM_MESSAGE: Optional[str] = ( - "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions." + "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions." ) CHAT_FORMAT = ( @@ -2690,70 +2691,72 @@ class Llava15ChatHandler: ) def __init__(self, clip_model_path: str, verbose: bool = True): - import llama_cpp.llava_cpp as llava_cpp + import llama_cpp.mtmd_cpp as mtmd_cpp self.clip_model_path = clip_model_path self.verbose = verbose - - self._llava_cpp = llava_cpp # TODO: Fix + self._mtmd_cpp = mtmd_cpp self._exit_stack = ExitStack() - self._last_image_embed: Optional[ - llava_cpp.CtypesPointer[llava_cpp.llava_image_embed] - ] = None - self._last_image_hash: Optional[int] = None + self.mtmd_ctx: Optional[mtmd_cpp.mtmd_context_p] = None if not os.path.exists(clip_model_path): raise ValueError(f"Clip model path does not exist: {clip_model_path}") + def _init_mtmd_context(self, llama_model: llama.Llama): + """Initialize mtmd context with the llama model.""" + if self.mtmd_ctx is not None: + return # Already initialized + with suppress_stdout_stderr(disable=self.verbose): - clip_ctx = self._llava_cpp.clip_model_load(self.clip_model_path.encode(), 0) + # Get default parameters + ctx_params = self._mtmd_cpp.mtmd_context_params_default() + ctx_params.use_gpu = True # TODO: Make this configurable + ctx_params.print_timings = self.verbose + ctx_params.n_threads = llama_model.n_threads + ctx_params.verbosity = 2 if self.verbose else 0 # GGML_LOG_LEVEL_INFO = 2 + + # Initialize mtmd context + self.mtmd_ctx = self._mtmd_cpp.mtmd_init_from_file( + self.clip_model_path.encode(), + llama_model.model, + ctx_params + ) - if clip_ctx is None: - raise ValueError(f"Failed to load clip model: {clip_model_path}") + if self.mtmd_ctx is None: + raise ValueError(f"Failed to load mtmd context from: {self.clip_model_path}") - self.clip_ctx = clip_ctx + # Check if vision is supported + if not self._mtmd_cpp.mtmd_support_vision(self.mtmd_ctx): + raise ValueError("Vision is not supported by this model") - def clip_free(): + def mtmd_free(): with suppress_stdout_stderr(disable=self.verbose): - self._llava_cpp.clip_free(self.clip_ctx) - - self._exit_stack.callback(clip_free) + if self.mtmd_ctx is not None: + self._mtmd_cpp.mtmd_free(self.mtmd_ctx) + self.mtmd_ctx = None - def last_image_embed_free(): - with suppress_stdout_stderr(disable=self.verbose): - if self._last_image_embed is not None: - self._llava_cpp.llava_image_embed_free(self._last_image_embed) - self._last_image_embed = None - - self._exit_stack.callback(last_image_embed_free) + self._exit_stack.callback(mtmd_free) def load_image(self, image_url: str) -> bytes: return self._load_image(image_url) - def _embed_image_bytes(self, image_bytes: bytes, n_threads_batch: int = 1): - if ( - self._last_image_embed is not None - and self._last_image_hash is not None - and hash(image_bytes) == self._last_image_hash - ): - return self._last_image_embed + 
def _create_bitmap_from_bytes(self, image_bytes: bytes): + """Create mtmd_bitmap from image bytes.""" + if self.mtmd_ctx is None: + raise ValueError("mtmd context not initialized") + with suppress_stdout_stderr(disable=self.verbose): - # Free the previous image embed - if self._last_image_embed is not None: - self._llava_cpp.llava_image_embed_free(self._last_image_embed) - self._last_image_embed = None - self._last_image_hash = None - embed = self._llava_cpp.llava_image_embed_make_with_bytes( - self.clip_ctx, - n_threads_batch, - (ctypes.c_uint8 * len(image_bytes)).from_buffer( - bytearray(image_bytes) - ), - len(image_bytes), + # Create bitmap from buffer using helper function + bitmap = self._mtmd_cpp.mtmd_helper_bitmap_init_from_buf( + self.mtmd_ctx, + (ctypes.c_uint8 * len(image_bytes)).from_buffer(bytearray(image_bytes)), + len(image_bytes) ) - self._last_image_embed = embed - self._last_image_hash = hash(image_bytes) - return embed + + if bitmap is None: + raise ValueError("Failed to create bitmap from image bytes") + + return bitmap def __call__( self, @@ -2794,7 +2797,9 @@ def __call__( llama_types.CreateChatCompletionResponse, Iterator[llama_types.CreateChatCompletionStreamResponse], ]: - assert self.clip_ctx is not None + # Initialize mtmd context + self._init_mtmd_context(llama) + assert self.mtmd_ctx is not None system_prompt = _get_system_message(messages) if system_prompt == "" and self.DEFAULT_SYSTEM_MESSAGE is not None: @@ -2809,54 +2814,131 @@ def __call__( trim_blocks=True, lstrip_blocks=True, ).from_string(self.CHAT_FORMAT) + + # Get the default media marker + media_marker = self._mtmd_cpp.mtmd_default_marker().decode('utf-8') + + # Replace image URLs with media markers in the template text = template.render( messages=messages, add_generation_prompt=True, eos_token=llama.detokenize([llama.token_eos()]), bos_token=llama.detokenize([llama.token_bos()]), ) - split_text = self.split_text_on_image_urls(text, image_urls) + + # Replace image URLs in text with media markers + for image_url in image_urls: + text = text.replace(image_url, media_marker) if self.verbose: print(text, file=sys.stderr) + # Create bitmaps from images + bitmaps = [] + bitmap_cleanup = [] + try: + for image_url in image_urls: + image_bytes = self.load_image(image_url) + bitmap = self._create_bitmap_from_bytes(image_bytes) + bitmaps.append(bitmap) + bitmap_cleanup.append(bitmap) + + # Create input text structure + input_text = self._mtmd_cpp.mtmd_input_text() + input_text.text = text.encode('utf-8') + input_text.add_special = True + input_text.parse_special = True + + # Create input chunks + chunks = self._mtmd_cpp.mtmd_input_chunks_init() + if chunks is None: + raise ValueError("Failed to create input chunks") - # Evaluate prompt - llama.reset() - llama._ctx.kv_cache_clear() - for type_, value in split_text: - if type_ == "text": - tokens = llama.tokenize( - value.encode("utf8"), add_bos=False, special=True + try: + # Tokenize text and images together + bitmap_array = (self._mtmd_cpp.mtmd_bitmap_p_ctypes * len(bitmaps))(*bitmaps) + result = self._mtmd_cpp.mtmd_tokenize( + self.mtmd_ctx, + chunks, + ctypes.byref(input_text), + bitmap_array, + len(bitmaps) ) - if llama.n_tokens + len(tokens) > llama.n_ctx(): - raise ValueError( - f"Prompt exceeds n_ctx: {llama.n_tokens + len(tokens)} > {llama.n_ctx()}" - ) - llama.eval(tokens) - else: - image_bytes = self.load_image(value) - embed = self._embed_image_bytes(image_bytes, llama.context_params.n_threads_batch) - if llama.n_tokens + embed.contents.n_image_pos > 
llama.n_ctx(): - raise ValueError( - f"Prompt exceeds n_ctx: {llama.n_tokens + embed.contents.n_image_pos} > {llama.n_ctx()}" - ) - n_past = ctypes.c_int(llama.n_tokens) - n_past_p = ctypes.pointer(n_past) - with suppress_stdout_stderr(disable=self.verbose): - self._llava_cpp.llava_eval_image_embed( - llama.ctx, - embed, - llama.n_batch, - n_past_p, - ) - # Required to avoid issues with hf tokenizer - llama.input_ids[llama.n_tokens : n_past.value] = -1 - llama.n_tokens = n_past.value - # Get prompt tokens to avoid a cache miss - prompt = llama.input_ids[: llama.n_tokens].tolist() + if result != 0: + raise ValueError(f"Failed to tokenize input: error code {result}") + + # Reset llama context + llama.reset() + llama._ctx.kv_cache_clear() + + # Process each chunk + n_past = llama_cpp.llama_pos(0) + n_chunks = self._mtmd_cpp.mtmd_input_chunks_size(chunks) + + for i in range(n_chunks): + chunk = self._mtmd_cpp.mtmd_input_chunks_get(chunks, i) + if chunk is None: + continue + + chunk_type = self._mtmd_cpp.mtmd_input_chunk_get_type(chunk) + + if chunk_type == self._mtmd_cpp.MTMD_INPUT_CHUNK_TYPE_TEXT: + # Handle text chunk + n_tokens_out = ctypes.c_size_t() + tokens_ptr = self._mtmd_cpp.mtmd_input_chunk_get_tokens_text( + chunk, ctypes.byref(n_tokens_out) + ) + + if tokens_ptr and n_tokens_out.value > 0: + # Convert ctypes array to Python list + tokens = [tokens_ptr[j] for j in range(n_tokens_out.value)] + + if llama.n_tokens + len(tokens) > llama.n_ctx(): + raise ValueError( + f"Prompt exceeds n_ctx: {llama.n_tokens + len(tokens)} > {llama.n_ctx()}" + ) + llama.eval(tokens) + + elif chunk_type in [self._mtmd_cpp.MTMD_INPUT_CHUNK_TYPE_IMAGE, self._mtmd_cpp.MTMD_INPUT_CHUNK_TYPE_AUDIO]: + # Handle image/audio chunk using helper + chunk_n_tokens = self._mtmd_cpp.mtmd_input_chunk_get_n_tokens(chunk) + + if llama.n_tokens + chunk_n_tokens > llama.n_ctx(): + raise ValueError( + f"Prompt exceeds n_ctx: {llama.n_tokens + chunk_n_tokens} > {llama.n_ctx()}" + ) + + new_n_past = llama_cpp.llama_pos(0) + result = self._mtmd_cpp.mtmd_helper_eval_chunk_single( + self.mtmd_ctx, + llama._ctx.ctx, + chunk, + llama_cpp.llama_pos(llama.n_tokens), + llama_cpp.llama_seq_id(0), + llama.n_batch, + False, # logits_last + ctypes.byref(new_n_past) + ) + + if result != 0: + raise ValueError(f"Failed to evaluate chunk: error code {result}") + + # Update llama's token count + llama.n_tokens = new_n_past.value + + # Get prompt tokens to avoid a cache miss + prompt = llama.input_ids[: llama.n_tokens].tolist() + finally: + self._mtmd_cpp.mtmd_input_chunks_free(chunks) + + finally: + # Cleanup bitmaps + for bitmap in bitmap_cleanup: + self._mtmd_cpp.mtmd_bitmap_free(bitmap) + + # Handle response format and tools (same as before) if response_format is not None and response_format["type"] == "json_object": grammar = _grammar_for_response_format(response_format) @@ -2931,6 +3013,7 @@ def __call__( grammar=grammar, logit_bias=logit_bias, ) + if tool is not None: tool_name = tool["function"]["name"] return _convert_completion_to_chat_function( @@ -2943,12 +3026,10 @@ def _load_image(image_url: str) -> bytes: # TODO: Add Pillow support for other image formats beyond (jpg, png) if image_url.startswith("data:"): import base64 - image_bytes = base64.b64decode(image_url.split(",")[1]) return image_bytes else: import urllib.request - with urllib.request.urlopen(image_url) as f: image_bytes = f.read() return image_bytes @@ -2974,6 +3055,7 @@ def get_image_urls(messages: List[llama_types.ChatCompletionRequestMessage]): @staticmethod def 
split_text_on_image_urls(text: str, image_urls: List[str]): + """This method is no longer used in the new implementation.""" def find_first(s: str, substrs: List[str]): for i, substr in enumerate(substrs): pos = s.find(substr) @@ -3373,6 +3455,61 @@ class MiniCPMv26ChatHandler(Llava15ChatHandler): ) +class Qwen25VLChatHandler(Llava15ChatHandler): + DEFAULT_SYSTEM_MESSAGE = "You are a helpful assistant." + + CHAT_FORMAT = ( + "<|im_start|>system\n" + "You are a helpful assistant.<|im_end|>\n" + "{% for message in messages %}" + "{% if message['role'] == 'user' %}" + "<|im_start|>user\n" + "{% if message['content'] is string %}" + "{{ message['content'] }}" + "{% else %}" + "{% for content in message['content'] %}" + "{% if content['type'] == 'text' %}" + "{{ content['text'] }}" + "{% elif content['type'] == 'image_url' %}" + "{% if content.image_url is string %}" + "{{ content.image_url }}" + "{% else %}" + "{{ content.image_url.url }}" + "{% endif %}" + "{% endif %}" + "{% endfor %}" + "{% endif %}" + "<|im_end|>\n" + "{% endif %}" + "{% endfor %}" + "<|im_start|>assistant\n" + ) + + def __call__(self, **kwargs): + llama = kwargs['llama'] + + # Clear state for multiple runs + llama.reset() + llama._ctx.kv_cache_clear() + llama.n_tokens = 0 + + if hasattr(llama, 'input_ids'): + llama.input_ids.fill(0) + + # Clear any handler state + if hasattr(self, '_last_image_embed'): + self._last_image_embed = None + self._last_image_hash = None + + if self.verbose: + messages = kwargs.get('messages', []) + image_count = len(self.get_image_urls(messages)) + print(f"Minimal - Cleared state, processing {image_count} images", file=sys.stderr) + + # Use parent implementation + return super().__call__(**kwargs) + + @register_chat_completion_handler("chatml-function-calling") def chatml_function_calling( llama: llama.Llama, diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 205d89a0b..d13d60458 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -161,9 +161,13 @@ llama_context_p = NewType("llama_context_p", int) llama_context_p_ctypes = ctypes.c_void_p -# # struct llama_sampler; -# llama_sampler_p = NewType("llama_sampler_p", int) -# llama_sampler_p_ctypes = ctypes.c_void_p +# typedef struct llama_memory_i * llama_memory_t; +llama_memory_t = NewType("llama_memory_t", int) +llama_memory_t_ctypes = ctypes.c_void_p + +# struct llama_kv_cache; (DEPRECATED) +llama_kv_cache_p = NewType("llama_kv_cache_p", int) +llama_kv_cache_p_ctypes = ctypes.c_void_p # typedef int32_t llama_pos; llama_pos = ctypes.c_int32 @@ -227,6 +231,13 @@ # LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26, # LLAMA_VOCAB_PRE_TYPE_MINERVA = 27, # LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28, +# LLAMA_VOCAB_PRE_TYPE_GPT4O = 29, +# LLAMA_VOCAB_PRE_TYPE_SUPERBPE = 30, +# LLAMA_VOCAB_PRE_TYPE_TRILLION = 31, +# LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32, +# LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33, +# LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34, +# LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35, # }; LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0 LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1 @@ -244,7 +255,7 @@ LLAMA_VOCAB_PRE_TYPE_DBRX = 13 LLAMA_VOCAB_PRE_TYPE_SMAUG = 14 LLAMA_VOCAB_PRE_TYPE_PORO = 15 -LLAMA_VOCAV_PRE_TYPE_CHATGLM3 = 16 +LLAMA_VOCAB_PRE_TYPE_CHATGLM3 = 16 LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17 LLAMA_VOCAB_PRE_TYPE_VIKING = 18 LLAMA_VOCAB_PRE_TYPE_JAIS = 19 @@ -257,6 +268,13 @@ LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26 LLAMA_VOCAB_PRE_TYPE_MINERVA = 27 LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28 +LLAMA_VOCAB_PRE_TYPE_GPT4O = 29 +LLAMA_VOCAB_PRE_TYPE_SUPERBPE = 30 
+LLAMA_VOCAB_PRE_TYPE_TRILLION = 31 +LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32 +LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33 +LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34 +LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35 # // note: these values should be synchronized with ggml_rope @@ -405,14 +423,14 @@ # LLAMA_ROPE_SCALING_TYPE_LINEAR = 1, # LLAMA_ROPE_SCALING_TYPE_YARN = 2, # LLAMA_ROPE_SCALING_TYPE_LONGROPE = 3, -# LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_YARN, +# LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_LONGROPE, # }; LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED = -1 LLAMA_ROPE_SCALING_TYPE_NONE = 0 LLAMA_ROPE_SCALING_TYPE_LINEAR = 1 LLAMA_ROPE_SCALING_TYPE_YARN = 2 LLAMA_ROPE_SCALING_TYPE_LONGROPE = 3 -LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_YARN +LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_LONGROPE # enum llama_pooling_type { # LLAMA_POOLING_TYPE_UNSPECIFIED = -1, @@ -442,7 +460,7 @@ # enum llama_split_mode { # LLAMA_SPLIT_MODE_NONE = 0, // single GPU # LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs -# LLAMA_SPLIT_MODE_ROW = 2, // split rows across GPUs +# LLAMA_SPLIT_MODE_ROW = 2, // split layers and KV across GPUs, use tensor parallelism if supported # }; LLAMA_SPLIT_MODE_NONE = 0 LLAMA_SPLIT_MODE_LAYER = 1 @@ -516,18 +534,21 @@ class llama_token_data_array(ctypes.Structure): ) -# // Input data for llama_decode +# // Input data for llama_encode/llama_decode # // A llama_batch object can contain input about one or many sequences # // The provided arrays (i.e. token, embd, pos, etc.) must have size of n_tokens # // # // - token : the token ids of the input (used when embd is NULL) # // - embd : token embeddings (i.e. float vector of size n_embd) (used when token is NULL) # // - pos : the positions of the respective token in the sequence -# // (if set to NULL, the token position will be tracked automatically by llama_decode) +# // (if set to NULL, the token position will be tracked automatically by llama_encode/llama_decode) # // - seq_id : the sequence to which the respective token belongs # // (if set to NULL, the sequence ID will be assumed to be 0) # // - logits : if zero, the logits (and/or the embeddings) for the respective token will not be output -# // (if set to NULL, only the logits for last token will be returned) +# // (if set to NULL: +# // - if embeddings: all tokens are output +# // - if not: only the last token is output +# // ) # // # typedef struct llama_batch { # int32_t n_tokens; @@ -537,10 +558,10 @@ class llama_token_data_array(ctypes.Structure): # llama_pos * pos; # int32_t * n_seq_id; # llama_seq_id ** seq_id; -# int8_t * logits; // TODO: rename this to "output" +# int8_t * logits; // TODO: rename this to "output" # } llama_batch; class llama_batch(ctypes.Structure): - """Input data for llama_decode + """Input data for llama_encode/llama_decode A llama_batch object can contain input about one or many sequences @@ -628,17 +649,23 @@ class llama_model_kv_override(ctypes.Structure): value: Union[int, float, bool, bytes] +# struct llama_model_tensor_buft_override { +# const char * pattern; +# ggml_backend_buffer_type_t buft; +# }; + + # struct llama_model_params { # // NULL-terminated list of devices to use for offloading (if NULL, all available devices are used) # ggml_backend_dev_t * devices; +# // NULL-terminated list of buffer types to use for tensors that match a pattern +# const struct llama_model_tensor_buft_override * tensor_buft_overrides; + # int32_t n_gpu_layers; // number of layers to store in VRAM # enum 
llama_split_mode split_mode; // how to split the model across multiple GPUs -# // main_gpu interpretation depends on split_mode: -# // LLAMA_SPLIT_MODE_NONE: the GPU that is used for the entire model -# // LLAMA_SPLIT_MODE_ROW: the GPU that is used for small tensors and intermediate results -# // LLAMA_SPLIT_MODE_LAYER: ignored +# // the GPU that is used for the entire model when split_mode is LLAMA_SPLIT_MODE_NONE # int32_t main_gpu; # // proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices() @@ -655,7 +682,6 @@ class llama_model_kv_override(ctypes.Structure): # // override key-value pairs of the model meta data # const struct llama_model_kv_override * kv_overrides; - # // Keep the booleans together to avoid misalignment during copy-by-value. # bool vocab_only; // only load the vocabulary, no weights # bool use_mmap; // use mmap if possible @@ -666,9 +692,11 @@ class llama_model_params(ctypes.Structure): """Parameters for llama_model Attributes: + devices (ctypes.Array[ggml_backend_dev_t]): NULL-terminated list of devices to use for offloading (if NULL, all available devices are used) + tensor_buft_overrides (ctypes.Array[llama_model_tensor_buft_override]): NULL-terminated list of buffer types to use for tensors that match a pattern n_gpu_layers (int): number of layers to store in VRAM split_mode (int): how to split the model across multiple GPUs - main_gpu (int): the GPU that is used for the entire model. main_gpu interpretation depends on split_mode: LLAMA_SPLIT_NONE: the GPU that is used for the entire model LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results LLAMA_SPLIT_LAYER: ignored + main_gpu (int): the GPU that is used for the entire model when split_mode is LLAMA_SPLIT_MODE_NONE tensor_split (ctypes.Array[ctypes.ctypes.c_float]): proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices() progress_callback (llama_progress_callback): called with a progress value between 0.0 and 1.0. Pass NULL to disable. If the provided progress_callback returns true, model loading continues. If it returns false, model loading is immediately aborted. 
progress_callback_user_data (ctypes.ctypes.c_void_p): context pointer passed to the progress callback @@ -679,6 +707,8 @@ class llama_model_params(ctypes.Structure): check_tensors (bool): validate model tensor data""" if TYPE_CHECKING: + devices: CtypesArray[ctypes.c_void_p] # NOTE: unused + tensor_buft_overrides: CtypesArray[llama_model_tensor_buft_override] # NOTE: unused n_gpu_layers: int split_mode: int main_gpu: int @@ -693,6 +723,7 @@ class llama_model_params(ctypes.Structure): _fields_ = [ ("devices", ctypes.c_void_p), # NOTE: unnused + ("tensor_buft_overrides", ctypes.c_void_p), # NOTE: unused ("n_gpu_layers", ctypes.c_int32), ("split_mode", ctypes.c_int), ("main_gpu", ctypes.c_int32), @@ -708,7 +739,7 @@ class llama_model_params(ctypes.Structure): # // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations -# // https://github.com/ggerganov/llama.cpp/pull/7544 +# // https://github.com/ggml-org/llama.cpp/pull/7544 # struct llama_context_params { # uint32_t n_ctx; // text context, 0 = from model # uint32_t n_batch; // logical maximum batch size that can be submitted to llama_decode @@ -721,7 +752,7 @@ class llama_model_params(ctypes.Structure): # enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id # enum llama_attention_type attention_type; // attention type to use for embeddings -# // ref: https://github.com/ggerganov/llama.cpp/pull/2054 +# // ref: https://github.com/ggml-org/llama.cpp/pull/2054 # float rope_freq_base; // RoPE base frequency, 0 = from model # float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model # float yarn_ext_factor; // YaRN extrapolation mix factor, negative = from model @@ -729,7 +760,7 @@ class llama_model_params(ctypes.Structure): # float yarn_beta_fast; // YaRN low correction dim # float yarn_beta_slow; // YaRN high correction dim # uint32_t yarn_orig_ctx; // YaRN original context size -# float defrag_thold; // defragment the KV cache if holes/size > thold, < 0 disabled (default) +# float defrag_thold; // defragment the KV cache if holes/size > thold, <= 0 disabled (default) # ggml_backend_sched_eval_callback cb_eval; # void * cb_eval_user_data; @@ -737,19 +768,21 @@ class llama_model_params(ctypes.Structure): # enum ggml_type type_k; // data type for K cache [EXPERIMENTAL] # enum ggml_type type_v; // data type for V cache [EXPERIMENTAL] -# // Keep the booleans together to avoid misalignment during copy-by-value. -# bool logits_all; // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead) -# bool embeddings; // if true, extract embeddings (together with logits) -# bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU -# bool flash_attn; // whether to use flash attention [EXPERIMENTAL] -# bool no_perf; // whether to measure performance timings - - # // Abort callback # // if it returns true, execution of llama_decode() will be aborted # // currently works only with CPU execution # ggml_abort_callback abort_callback; # void * abort_callback_data; + +# // Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value. 
+# bool embeddings; // if true, extract embeddings (together with logits) +# bool offload_kqv; // offload the KQV ops (including the KV cache) to GPU +# bool flash_attn; // use flash attention [EXPERIMENTAL] +# bool no_perf; // measure performance timings +# bool op_offload; // offload host tensor operations to device +# bool swa_full; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055) +# // NOTE: setting to false when n_seq_max > 1 can cause bad performance in some cases +# // ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573 # }; class llama_context_params(ctypes.Structure): """Parameters for llama_context @@ -771,18 +804,19 @@ class llama_context_params(ctypes.Structure): yarn_beta_fast (float): YaRN low correction dim yarn_beta_slow (float): YaRN high correction dim yarn_orig_ctx (int): YaRN original context size - defrag_thold (float): defragment the KV cache if holes/size > thold, < 0 disabled (default) + defrag_thold (float): defragment the KV cache if holes/size > thold, <= 0 disabled (default) cb_eval (ggml_backend_sched_eval_callback): callback for scheduling eval cb_eval_user_data (ctypes.ctypes.c_void_p): user data for cb_eval type_k (int): data type for K cache type_v (int): data type for V cache - logits_all (bool): the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead) + abort_callback (ggml_abort_callback): abort callback if it returns true, execution of llama_decode() will be aborted + abort_callback_data (ctypes.ctypes.c_void_p): data for abort_callback embeddings (bool): if true, extract embeddings (together with logits) offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU flash_attn (bool): whether to use flash attention no_perf (bool): whether to measure performance timings - abort_callback (ggml_abort_callback): abort callback if it returns true, execution of llama_decode() will be aborted - abort_callback_data (ctypes.ctypes.c_void_p): data for abort_callback + op_offload (bool): offload host tensor operations to device + swa_full (bool): use full-size SWA cache """ if TYPE_CHECKING: @@ -807,13 +841,14 @@ class llama_context_params(ctypes.Structure): cb_eval_user_data: ctypes.c_void_p type_k: int type_v: int - logits_all: bool + abort_callback: Callable[[ctypes.c_void_p], bool] + abort_callback_data: ctypes.c_void_p embeddings: bool offload_kqv: bool flash_attn: bool no_perf: bool - abort_callback: Callable[[ctypes.c_void_p], bool] - abort_callback_data: ctypes.c_void_p + op_offload: bool + swa_full: bool _fields_ = [ ("n_ctx", ctypes.c_uint32), @@ -837,13 +872,14 @@ class llama_context_params(ctypes.Structure): ("cb_eval_user_data", ctypes.c_void_p), ("type_k", ctypes.c_int), ("type_v", ctypes.c_int), - ("logits_all", ctypes.c_bool), + ("abort_callback", ggml_abort_callback), + ("abort_callback_data", ctypes.c_void_p), ("embeddings", ctypes.c_bool), ("offload_kqv", ctypes.c_bool), ("flash_attn", ctypes.c_bool), ("no_perf", ctypes.c_bool), - ("abort_callback", ggml_abort_callback), - ("abort_callback_data", ctypes.c_void_p), + ("op_offload", ctypes.c_bool), + ("swa_full", ctypes.c_bool), ] @@ -865,17 +901,19 @@ class llama_context_params(ctypes.Structure): # // model quantization parameters # typedef struct llama_model_quantize_params { -# int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency() -# enum llama_ftype ftype; // quantize to this 
llama_ftype -# enum ggml_type output_tensor_type; // output tensor type -# enum ggml_type token_embedding_type; // token embeddings tensor type -# bool allow_requantize; // allow quantizing non-f32/f16 tensors -# bool quantize_output_tensor; // quantize output.weight -# bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored -# bool pure; // quantize all tensors to the default type -# bool keep_split; // quantize to the same number of shards -# void * imatrix; // pointer to importance matrix data -# void * kv_overrides; // pointer to vector containing overrides +# int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency() +# enum llama_ftype ftype; // quantize to this llama_ftype +# enum ggml_type output_tensor_type; // output tensor type +# enum ggml_type token_embedding_type; // token embeddings tensor type +# bool allow_requantize; // allow quantizing non-f32/f16 tensors +# bool quantize_output_tensor; // quantize output.weight +# bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored +# bool pure; // quantize all tensors to the default type +# bool keep_split; // quantize to the same number of shards +# void * imatrix; // pointer to importance matrix data +# void * kv_overrides; // pointer to vector containing overrides +# void * tensor_types; // pointer to vector containing tensor types +# void * prune_layers; // pointer to vector containing layer indices to prune # } llama_model_quantize_params; class llama_model_quantize_params(ctypes.Structure): """Parameters for llama_model_quantize @@ -892,6 +930,8 @@ class llama_model_quantize_params(ctypes.Structure): keep_split (bool): quantize to the same number of shards imatrix (ctypes.c_void_p): pointer to importance matrix data kv_overrides (ctypes.c_void_p): pointer to vector containing overrides + tensor_types (ctypes.c_void_p): pointer to vector containing tensor types + prune_layers (ctypes.c_void_p): pointer to vector containing layer indices to prune """ if TYPE_CHECKING: @@ -906,6 +946,8 @@ class llama_model_quantize_params(ctypes.Structure): keep_split: bool imatrix: ctypes.c_void_p kv_overrides: ctypes.c_void_p + tensor_types: ctypes.c_void_p + prune_layers: ctypes.c_void_p _fields_ = [ ("nthread", ctypes.c_int32), @@ -919,6 +961,8 @@ class llama_model_quantize_params(ctypes.Structure): ("keep_split", ctypes.c_bool), ("imatrix", ctypes.c_void_p), ("kv_overrides", ctypes.c_void_p), + ("tensor_types", ctypes.c_void_p), + ("prune_layers", ctypes.c_void_p), ] @@ -1029,7 +1073,6 @@ def llama_model_quantize_default_params() -> llama_model_quantize_params: # // Initialize the llama + ggml backend # // If numa is true, use NUMA optimizations # // Call once at the start of the program -# LLAMA_API void llama_backend_init(bool numa); # LLAMA_API void llama_backend_init(void); @ctypes_function( "llama_backend_init", @@ -1038,7 +1081,6 @@ def llama_model_quantize_default_params() -> llama_model_quantize_params: ) def llama_backend_init(): """Initialize the llama + ggml backend - If numa is true, use NUMA optimizations Call once at the start of the program""" ... @@ -1152,7 +1194,21 @@ def llama_model_load_from_splits( ... 
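The reworked llama_model_params / llama_context_params layouts above are set by field name from Python, so the reordering mainly matters when constructing them. A minimal sketch (assuming the usual llama_model_default_params / llama_context_default_params helpers from these bindings) that touches a few of the fields discussed here, including the new trailing booleans:

import llama_cpp

def make_params(n_gpu_layers: int = 0, n_ctx: int = 4096):
    # Model parameters: only a couple of fields are touched for illustration.
    mparams = llama_cpp.llama_model_default_params()
    mparams.n_gpu_layers = n_gpu_layers
    mparams.use_mmap = True

    # Context parameters: the boolean flags now sit at the end of the struct,
    # after abort_callback / abort_callback_data, and logits_all is gone.
    cparams = llama_cpp.llama_context_default_params()
    cparams.n_ctx = n_ctx
    cparams.defrag_thold = -1.0  # <= 0 disables KV cache defragmentation
    cparams.embeddings = False
    cparams.op_offload = True    # new: offload host tensor ops to the device
    cparams.swa_full = True      # new: use the full-size SWA cache
    return mparams, cparams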
-# LLAMA_API void llama_free_model(struct llama_model * model); +# LLAMA_API void llama_model_save_to_file( +# const struct llama_model * model, +# const char * path_model); +@ctypes_function( + "llama_model_save_to_file", + [llama_model_p_ctypes, ctypes.c_char_p], + None, +) +def llama_model_save_to_file(model: llama_model_p, path_model: bytes, /): + """Save the model to a file""" + ... + + +# DEPRECATED(LLAMA_API void llama_free_model(struct llama_model * model), +# "use llama_model_free instead"); @ctypes_function( "llama_free_model", [llama_model_p_ctypes], @@ -1229,6 +1285,12 @@ def llama_max_devices() -> int: ... +# LLAMA_API size_t llama_max_parallel_sequences(void); +@ctypes_function("llama_max_parallel_sequences", [], ctypes.c_size_t) +def llama_max_parallel_sequences() -> int: + ... + + # LLAMA_API bool llama_supports_mmap (void); @ctypes_function("llama_supports_mmap", [], ctypes.c_bool) def llama_supports_mmap() -> bool: @@ -1277,8 +1339,6 @@ def llama_n_seq_max(ctx: llama_context_p, /) -> int: ... - - # DEPRECATED(LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model), "use llama_model_n_ctx_train instead"); @ctypes_function("llama_n_ctx_train", [llama_model_p_ctypes], ctypes.c_int32) def llama_n_ctx_train(model: llama_model_p, /) -> int: @@ -1315,12 +1375,30 @@ def llama_get_model(ctx: llama_context_p, /) -> Optional[llama_model_p]: ... -# LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx); +# LLAMA_API llama_memory_t llama_get_memory (const struct llama_context * ctx); +@ctypes_function("llama_get_memory", [llama_context_p_ctypes], llama_memory_t_ctypes) +def llama_get_memory(ctx: llama_context_p, /) -> Optional[llama_memory_t]: + """Get the memory for the context""" + ... + + +# LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx); @ctypes_function("llama_pooling_type", [llama_context_p_ctypes], ctypes.c_int) def llama_pooling_type(ctx: llama_context_p, /) -> int: ... +# DEPRECATED(LLAMA_API struct llama_kv_cache * llama_get_kv_self(struct llama_context * ctx), "use llama_get_memory instead"); +@ctypes_function( + "llama_get_kv_self", + [llama_context_p_ctypes], + llama_kv_cache_p_ctypes, +) +def llama_get_kv_self(ctx: llama_context_p, /) -> Optional[llama_kv_cache_p]: + """Get the KV cache for self-attention (DEPRECATED)""" + ... + + # LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model); @ctypes_function("llama_model_get_vocab", [llama_model_p_ctypes], llama_vocab_p_ctypes) def llama_model_get_vocab(model: llama_model_p, /) -> Optional[llama_vocab_p]: @@ -1357,6 +1435,18 @@ def llama_model_n_head(model: llama_model_p, /) -> int: ... +# LLAMA_API int32_t llama_model_n_head_kv (const struct llama_model * model); +@ctypes_function("llama_model_n_head_kv", [llama_model_p_ctypes], ctypes.c_int32) +def llama_model_n_head_kv(model: llama_model_p, /) -> int: + ... + + +# LLAMA_API int32_t llama_model_n_swa (const struct llama_model * model); +@ctypes_function("llama_model_n_swa", [llama_model_p_ctypes], ctypes.c_int32) +def llama_model_n_swa(model: llama_model_p, /) -> int: + ... + + # // Get the model's RoPE frequency scaling factor # LLAMA_API float llama_model_rope_freq_scale_train(const struct llama_model * model); @ctypes_function("llama_model_rope_freq_scale_train", [llama_model_p_ctypes], ctypes.c_float) @@ -1364,9 +1454,26 @@ def llama_model_rope_freq_scale_train(model: llama_model_p, /) -> float: ... 
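As a usage sketch for the llama_model_* getters bound above (the model handle is assumed to come from llama_model_load_from_file or llama_model_load_from_splits):

import llama_cpp

def describe_model(model: llama_cpp.llama_model_p) -> dict:
    # Per-model properties via the new-style getters; the last entry is a
    # global property of the build, not of the model.
    return {
        "n_head_kv": llama_cpp.llama_model_n_head_kv(model),
        "n_swa": llama_cpp.llama_model_n_swa(model),
        "rope_freq_scale_train": llama_cpp.llama_model_rope_freq_scale_train(model),
        "max_parallel_sequences": llama_cpp.llama_max_parallel_sequences(),
    }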
+# // Returns the number of classifier outputs (only valid for classifier models) +# // Undefined behavior for non-classifier models +# LLAMA_API uint32_t llama_model_n_cls_out(const struct llama_model * model); +@ctypes_function("llama_model_n_cls_out", [llama_model_p_ctypes], ctypes.c_uint32) +def llama_model_n_cls_out(model: llama_model_p, /) -> int: + """Returns the number of classifier outputs (only valid for classifier models)""" + ... + + +# // Returns label of classifier output by index (<n_cls_out). Returns nullptr if no label provided +# LLAMA_API const char * llama_model_cls_label(const struct llama_model * model, uint32_t i); +@ctypes_function("llama_model_cls_label", [llama_model_p_ctypes, ctypes.c_uint32], ctypes.c_char_p) +def llama_model_cls_label(model: llama_model_p, i: int, /) -> Optional[bytes]: + """Returns label of classifier output by index. Returns None if no label provided""" + ... + + # LLAMA_API enum llama_vocab_type llama_vocab_type (const struct llama_model * model); -@ctypes_function("llama_vocab_type", [llama_model_p_ctypes], ctypes.c_int) -def llama_vocab_type(model: llama_model_p, /) -> int: +@ctypes_function("llama_vocab_type", [llama_vocab_p_ctypes], ctypes.c_int) +def llama_vocab_type(vocab: llama_vocab_p, /) -> int: ... @@ -1564,6 +1671,10 @@ def llama_model_quantize( ... +# // +# // Adapters +# // + # // Load a LoRA adapter from file # LLAMA_API struct llama_adapter_lora * llama_adapter_lora_init( # struct llama_model * model, # const char * path_lora); @@ -1687,148 +1798,256 @@ def llama_apply_adapter_cvec( # // -# // KV cache +# // Memory # // +# // Clear the memory contents +# // If data == true, the data buffers will also be cleared together with the metadata +# LLAMA_API void llama_memory_clear( +# llama_memory_t mem, +# bool data); +@ctypes_function( + "llama_memory_clear", + [llama_memory_t_ctypes, ctypes.c_bool], + None, +) +def llama_memory_clear(mem: llama_memory_t, data: bool, /): + """Clear the memory contents + If data == true, the data buffers will also be cleared together with the metadata""" + ... -# // Information associated with an individual cell in the KV cache view. -# struct llama_kv_cache_view_cell { -# // The position for this cell. Takes KV cache shifts into account. -# // May be negative if the cell is not populated. -# llama_pos pos; -# }; -class llama_kv_cache_view_cell(ctypes.Structure): - """Information associated with an individual cell in the KV cache view. - - Attributes: - pos (llama_pos): The position for this cell. Takes KV cache shifts into account. - May be negative if the cell is not populated.""" - - if TYPE_CHECKING: - pos: llama_pos - - _fields_ = [("pos", llama_pos)] - - -# // An updateable view of the KV cache. -# struct llama_kv_cache_view { -# // Number of KV cache cells. This will be the same as the context size. -# int32_t n_cells; -# // Maximum number of sequences that can exist in a cell. It's not an error -# // if there are more sequences in a cell than this value, however they will -# // not be visible in the view cells_sequences. -# int32_t n_seq_max; +# // Removes all tokens that belong to the specified sequence and have positions in [p0, p1) +# // Returns false if a partial sequence cannot be removed.
Removing a whole sequence never fails +# // seq_id < 0 : match any sequence +# // p0 < 0 : [0, p1] +# // p1 < 0 : [p0, inf) +# LLAMA_API bool llama_memory_seq_rm( +# llama_memory_t mem, +# llama_seq_id seq_id, +# llama_pos p0, +# llama_pos p1); +@ctypes_function( + "llama_memory_seq_rm", + [ + llama_memory_t_ctypes, + llama_seq_id, + llama_pos, + llama_pos, + ], + ctypes.c_bool, +) +def llama_memory_seq_rm( + mem: llama_memory_t, + seq_id: Union[llama_seq_id, int], + p0: Union[llama_pos, int], + p1: Union[llama_pos, int], + /, +) -> bool: + """Removes all tokens that belong to the specified sequence and have positions in [p0, p1) -# // Number of tokens in the cache. For example, if there are two populated -# // cells, the first with 1 sequence id in it and the second with 2 sequence -# // ids then you'll have 3 tokens. -# int32_t token_count; + Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails -# // Number of populated cache cells. -# int32_t used_cells; + seq_id < 0 : match any sequence + p0 < 0 : [0, p1] + p1 < 0 : [p0, inf)""" + ... -# // Maximum contiguous empty slots in the cache. -# int32_t max_contiguous; -# // Index to the start of the max_contiguous slot range. Can be negative -# // when cache is full. -# int32_t max_contiguous_idx; +# // Copy all tokens that belong to the specified sequence to another sequence +# // p0 < 0 : [0, p1] +# // p1 < 0 : [p0, inf) +# LLAMA_API void llama_memory_seq_cp( +# llama_memory_t mem, +# llama_seq_id seq_id_src, +# llama_seq_id seq_id_dst, +# llama_pos p0, +# llama_pos p1); +@ctypes_function( + "llama_memory_seq_cp", + [ + llama_memory_t_ctypes, + llama_seq_id, + llama_seq_id, + llama_pos, + llama_pos, + ], + None, +) +def llama_memory_seq_cp( + mem: llama_memory_t, + seq_id_src: Union[llama_seq_id, int], + seq_id_dst: Union[llama_seq_id, int], + p0: Union[llama_pos, int], + p1: Union[llama_pos, int], + /, +): + """Copy all tokens that belong to the specified sequence to another sequence + p0 < 0 : [0, p1] + p1 < 0 : [p0, inf)""" + ... -# // Information for an individual cell. -# struct llama_kv_cache_view_cell * cells; +# // Removes all tokens that do not belong to the specified sequence +# LLAMA_API void llama_memory_seq_keep( +# llama_memory_t mem, +# llama_seq_id seq_id); +@ctypes_function( + "llama_memory_seq_keep", [llama_memory_t_ctypes, llama_seq_id], None +) +def llama_memory_seq_keep(mem: llama_memory_t, seq_id: Union[llama_seq_id, int], /): + """Removes all tokens that do not belong to the specified sequence""" + ... -# // The sequences for each cell. There will be n_seq_max items per cell. 
-# llama_seq_id * cells_sequences; -# }; -class llama_kv_cache_view(ctypes.Structure): - if TYPE_CHECKING: - n_cells: int - n_max_seq: int - token_count: int - used_cells: int - max_contiguous: int - max_contiguous_idx: int - cells: CtypesArray[llama_kv_cache_view_cell] - cells_sequences: CtypesArray[llama_seq_id] - _fields_ = [ - ("n_cells", ctypes.c_int32), - ("n_max_seq", ctypes.c_int32), - ("token_count", ctypes.c_int32), - ("used_cells", ctypes.c_int32), - ("max_contiguous", ctypes.c_int32), - ("max_contiguous_idx", ctypes.c_int32), - ("cells", ctypes.POINTER(llama_kv_cache_view_cell)), - ("cells_sequences", ctypes.POINTER(llama_seq_id)), - ] +# // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1) +# // p0 < 0 : [0, p1] +# // p1 < 0 : [p0, inf) +# LLAMA_API void llama_memory_seq_add( +# llama_memory_t mem, +# llama_seq_id seq_id, +# llama_pos p0, +# llama_pos p1, +# llama_pos delta); +@ctypes_function( + "llama_memory_seq_add", + [ + llama_memory_t_ctypes, + llama_seq_id, + llama_pos, + llama_pos, + llama_pos, + ], + None, +) +def llama_memory_seq_add( + mem: llama_memory_t, + seq_id: Union[llama_seq_id, int], + p0: Union[llama_pos, int], + p1: Union[llama_pos, int], + delta: Union[llama_pos, int], + /, +): + """Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1) + p0 < 0 : [0, p1] + p1 < 0 : [p0, inf)""" + ... -llama_kv_cache_view_p = ctypes.POINTER(llama_kv_cache_view) +# // Integer division of the positions by factor of `d > 1` +# // p0 < 0 : [0, p1] +# // p1 < 0 : [p0, inf) +# LLAMA_API void llama_memory_seq_div( +# llama_memory_t mem, +# llama_seq_id seq_id, +# llama_pos p0, +# llama_pos p1, +# int d); +@ctypes_function( + "llama_memory_seq_div", + [ + llama_memory_t_ctypes, + llama_seq_id, + llama_pos, + llama_pos, + ctypes.c_int, + ], + None, +) +def llama_memory_seq_div( + mem: llama_memory_t, + seq_id: Union[llama_seq_id, int], + p0: Union[llama_pos, int], + p1: Union[llama_pos, int], + d: Union[ctypes.c_int, int], + /, +): + """Integer division of the positions by factor of `d > 1` + p0 < 0 : [0, p1] + p1 < 0 : [p0, inf)""" + ... -# // Create an empty KV cache view. (use only for debugging purposes) -# LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_seq_max); +# // Returns the smallest position present in the memory for the specified sequence +# // This is typically non-zero only for SWA caches +# // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the memory +# // Return -1 if the sequence is empty +# LLAMA_API llama_pos llama_memory_seq_pos_min( +# llama_memory_t mem, +# llama_seq_id seq_id); @ctypes_function( - "llama_kv_cache_view_init", - [llama_context_p_ctypes, ctypes.c_int32], - llama_kv_cache_view, + "llama_memory_seq_pos_min", [llama_memory_t_ctypes, llama_seq_id], llama_pos ) -def llama_kv_cache_view_init( - ctx: llama_context_p, n_seq_max: Union[ctypes.c_int32, int], / -) -> llama_kv_cache_view: - """Create an empty KV cache view. (use only for debugging purposes)""" +def llama_memory_seq_pos_min( + mem: llama_memory_t, seq_id: Union[llama_seq_id, int], / +) -> int: + """Returns the smallest position present in the memory for the specified sequence + This is typically non-zero only for SWA caches + Return -1 if the sequence is empty""" ... -# // Free a KV cache view. 
(use only for debugging purposes) -# LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view); -@ctypes_function("llama_kv_cache_view_free", [llama_kv_cache_view_p], None) -def llama_kv_cache_view_free(view: "ctypes.pointer[llama_kv_cache_view]", /): # type: ignore - """Free a KV cache view. (use only for debugging purposes)""" +# // Returns the largest position present in the memory for the specified sequence +# // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the memory +# // Return -1 if the sequence is empty +# LLAMA_API llama_pos llama_memory_seq_pos_max( +# llama_memory_t mem, +# llama_seq_id seq_id); +@ctypes_function( + "llama_memory_seq_pos_max", [llama_memory_t_ctypes, llama_seq_id], llama_pos +) +def llama_memory_seq_pos_max( + mem: llama_memory_t, seq_id: Union[llama_seq_id, int], / +) -> int: + """Returns the largest position present in the memory for the specified sequence + Return -1 if the sequence is empty""" ... -# // Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes) -# LLAMA_API void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view); -@ctypes_function( - "llama_kv_cache_view_update", [llama_context_p_ctypes, llama_kv_cache_view_p], None -) -def llama_kv_cache_view_update(ctx: llama_context_p, view: CtypesPointerOrRef[llama_kv_cache_view], /): # type: ignore - """Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)""" +# // Check if the memory supports shifting +# LLAMA_API bool llama_memory_can_shift(llama_memory_t mem); +@ctypes_function("llama_memory_can_shift", [llama_memory_t_ctypes], ctypes.c_bool) +def llama_memory_can_shift(mem: llama_memory_t, /) -> bool: + """Check if the memory supports shifting""" ... +# // +# // KV cache for self-attention (TODO: deprecate in favor of llama_memory) +# // + # // Returns the number of tokens in the KV cache (slow, use only for debug) # // If a KV cell has multiple sequences assigned to it, it will be counted multiple times -# LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx); +# DEPRECATED(LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx), +# "Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)"); @ctypes_function( - "llama_get_kv_cache_token_count", [llama_context_p_ctypes], ctypes.c_int32 + "llama_kv_self_n_tokens", [llama_context_p_ctypes], ctypes.c_int32 ) -def llama_get_kv_cache_token_count(ctx: llama_context_p, /) -> int: - """Returns the number of tokens in the KV cache (slow, use only for debug) - If a KV cell has multiple sequences assigned to it, it will be counted multiple times - """ +def llama_kv_self_n_tokens(ctx: llama_context_p, /) -> int: + """Returns the number of tokens in the KV cache (slow, use only for debug) (DEPRECATED)""" ... # // Returns the number of used KV cells (i.e. 
have at least one sequence assigned to them) -# LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx); +# DEPRECATED(LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx), +# "Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)"); @ctypes_function( - "llama_get_kv_cache_used_cells", [llama_context_p_ctypes], ctypes.c_int32 + "llama_kv_self_used_cells", [llama_context_p_ctypes], ctypes.c_int32 ) -def llama_get_kv_cache_used_cells(ctx: llama_context_p, /) -> int: - """Returns the number of used KV cells (i.e. have at least one sequence assigned to them)""" +def llama_kv_self_used_cells(ctx: llama_context_p, /) -> int: + """Returns the number of used KV cells (DEPRECATED)""" ... # // Clear the KV cache - both cell info is erased and KV data is zeroed -# LLAMA_API void llama_kv_cache_clear( -# struct llama_context * ctx); -@ctypes_function("llama_kv_cache_clear", [llama_context_p_ctypes], None) -def llama_kv_cache_clear(ctx: llama_context_p, /): - """Clear the KV cache""" +# DEPRECATED(LLAMA_API void llama_kv_self_clear( +# struct llama_context * ctx), +# "Use llama_memory_clear() instead"); +@ctypes_function( + "llama_kv_self_clear", [llama_context_p_ctypes], None +) +def llama_kv_self_clear(ctx: llama_context_p, /): + """Clear the KV cache (DEPRECATED)""" ... @@ -1837,13 +2056,14 @@ def llama_kv_cache_clear(ctx: llama_context_p, /): # // seq_id < 0 : match any sequence # // p0 < 0 : [0, p1] # // p1 < 0 : [p0, inf) -# LLAMA_API bool llama_kv_cache_seq_rm( +# DEPRECATED(LLAMA_API bool llama_kv_self_seq_rm( # struct llama_context * ctx, # llama_seq_id seq_id, # llama_pos p0, -# llama_pos p1); +# llama_pos p1), +# "Use llama_memory_seq_rm() instead"); @ctypes_function( - "llama_kv_cache_seq_rm", + "llama_kv_self_seq_rm", [ llama_context_p_ctypes, llama_seq_id, @@ -1852,20 +2072,14 @@ def llama_kv_cache_clear(ctx: llama_context_p, /): ], ctypes.c_bool, ) -def llama_kv_cache_seq_rm( +def llama_kv_self_seq_rm( ctx: llama_context_p, seq_id: Union[llama_seq_id, int], p0: Union[llama_pos, int], p1: Union[llama_pos, int], /, ) -> bool: - """Removes all tokens that belong to the specified sequence and have positions in [p0, p1) - - Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails - - seq_id < 0 : match any sequence - p0 < 0 : [0, p1] - p1 < 0 : [p0, inf)""" + """Remove tokens from KV cache (DEPRECATED)""" ... 
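Taken together, llama_get_memory() plus the llama_memory_* functions above replace the per-context KV cache calls that are deprecated from here on. A minimal migration sketch, assuming ctx is an existing llama_context_p:

import llama_cpp

def reset_sequence(ctx: llama_cpp.llama_context_p, seq_id: int = 0) -> None:
    # Fetch the memory handle once, then operate on it instead of calling
    # the deprecated llama_kv_self_* / llama_kv_cache_* wrappers.
    mem = llama_cpp.llama_get_memory(ctx)
    # p0 < 0 and p1 < 0 cover [0, inf), i.e. the whole sequence.
    llama_cpp.llama_memory_seq_rm(mem, seq_id, -1, -1)

def clear_all(ctx: llama_cpp.llama_context_p) -> None:
    mem = llama_cpp.llama_get_memory(ctx)
    llama_cpp.llama_memory_clear(mem, True)  # True also zeroes the data buffers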
@@ -1873,14 +2087,15 @@ def llama_kv_cache_seq_rm( # // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence # // p0 < 0 : [0, p1] # // p1 < 0 : [p0, inf) -# LLAMA_API void llama_kv_cache_seq_cp( +# DEPRECATED(LLAMA_API void llama_kv_self_seq_cp( # struct llama_context * ctx, # llama_seq_id seq_id_src, # llama_seq_id seq_id_dst, # llama_pos p0, -# llama_pos p1); +# llama_pos p1), +# "Use llama_memory_seq_cp() instead"); @ctypes_function( - "llama_kv_cache_seq_cp", + "llama_kv_self_seq_cp", [ llama_context_p_ctypes, llama_seq_id, @@ -1890,7 +2105,7 @@ def llama_kv_cache_seq_rm( ], None, ) -def llama_kv_cache_seq_cp( +def llama_kv_self_seq_cp( ctx: llama_context_p, seq_id_src: Union[llama_seq_id, int], seq_id_dst: Union[llama_seq_id, int], @@ -1898,39 +2113,37 @@ def llama_kv_cache_seq_cp( p1: Union[llama_pos, int], /, ): - """Copy all tokens that belong to the specified sequence to another sequence - Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence - p0 < 0 : [0, p1] - p1 < 0 : [p0, inf)""" + """Copy tokens in KV cache (DEPRECATED)""" ... # // Removes all tokens that do not belong to the specified sequence -# LLAMA_API void llama_kv_cache_seq_keep( +# DEPRECATED(LLAMA_API void llama_kv_self_seq_keep( # struct llama_context * ctx, -# llama_seq_id seq_id); +# llama_seq_id seq_id), +# "Use llama_memory_seq_keep() instead"); @ctypes_function( - "llama_kv_cache_seq_keep", [llama_context_p_ctypes, llama_seq_id], None + "llama_kv_self_seq_keep", [llama_context_p_ctypes, llama_seq_id], None ) -def llama_kv_cache_seq_keep(ctx: llama_context_p, seq_id: Union[llama_seq_id, int], /): - """Removes all tokens that do not belong to the specified sequence""" +def llama_kv_self_seq_keep(ctx: llama_context_p, seq_id: Union[llama_seq_id, int], /): + """Keep only specified sequence in KV cache (DEPRECATED)""" ... # // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1) # // If the KV cache is RoPEd, the KV data is updated accordingly: # // - lazily on next llama_decode() -# // - explicitly with llama_kv_cache_update() # // p0 < 0 : [0, p1] # // p1 < 0 : [p0, inf) -# LLAMA_API void llama_kv_cache_seq_add( +# DEPRECATED(LLAMA_API void llama_kv_self_seq_add( # struct llama_context * ctx, # llama_seq_id seq_id, # llama_pos p0, # llama_pos p1, -# llama_pos delta); +# llama_pos delta), +# "Use llama_memory_seq_add() instead"); @ctypes_function( - "llama_kv_cache_seq_add", + "llama_kv_self_seq_add", [ llama_context_p_ctypes, llama_seq_id, @@ -1940,7 +2153,7 @@ def llama_kv_cache_seq_keep(ctx: llama_context_p, seq_id: Union[llama_seq_id, in ], None, ) -def llama_kv_cache_seq_add( +def llama_kv_self_seq_add( ctx: llama_context_p, seq_id: Union[llama_seq_id, int], p0: Union[llama_pos, int], @@ -1948,27 +2161,24 @@ def llama_kv_cache_seq_add( delta: Union[llama_pos, int], /, ): - """Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1) - If the KV cache is RoPEd, the KV data is updated accordingly: - - lazily on next llama_decode() - - explicitly with llama_kv_cache_update() - p0 < 0 : [0, p1] - p1 < 0 : [p0, inf)""" + """Add delta to sequence positions in KV cache (DEPRECATED)""" ... 
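One common pattern built from these primitives is the context shift: drop the oldest tokens of a sequence and slide the remaining positions back, using llama_memory_seq_rm() and llama_memory_seq_add() instead of the deprecated wrappers. A sketch under the assumption that n_keep and n_discard are chosen by the caller:

import llama_cpp

def shift_context(ctx: llama_cpp.llama_context_p, seq_id: int,
                  n_keep: int, n_discard: int) -> bool:
    mem = llama_cpp.llama_get_memory(ctx)
    if not llama_cpp.llama_memory_can_shift(mem):
        return False
    # Remove positions [n_keep, n_keep + n_discard) ...
    llama_cpp.llama_memory_seq_rm(mem, seq_id, n_keep, n_keep + n_discard)
    # ... then shift everything after the hole back by n_discard.
    llama_cpp.llama_memory_seq_add(mem, seq_id, n_keep + n_discard, -1, -n_discard)
    return True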
# // Integer division of the positions by factor of `d > 1` -# // If the KV cache is RoPEd, the KV data is updated accordingly +# // If the KV cache is RoPEd, the KV data is updated accordingly: +# // - lazily on next llama_decode() # // p0 < 0 : [0, p1] # // p1 < 0 : [p0, inf) -# LLAMA_API void llama_kv_cache_seq_div( +# DEPRECATED(void llama_kv_self_seq_div( # struct llama_context * ctx, # llama_seq_id seq_id, # llama_pos p0, # llama_pos p1, -# int d); +# int d), +# "Use llama_memory_seq_div() instead"); @ctypes_function( - "llama_kv_cache_seq_div", + "llama_kv_self_seq_div", [ llama_context_p_ctypes, llama_seq_id, @@ -1978,7 +2188,7 @@ def llama_kv_cache_seq_add( ], None, ) -def llama_kv_cache_seq_div( +def llama_kv_self_seq_div( ctx: llama_context_p, seq_id: Union[llama_seq_id, int], p0: Union[llama_pos, int], @@ -1986,40 +2196,71 @@ def llama_kv_cache_seq_div( d: Union[ctypes.c_int, int], /, ): - """Integer division of the positions by factor of `d > 1` - If the KV cache is RoPEd, the KV data is updated accordingly - p0 < 0 : [0, p1] - p1 < 0 : [p0, inf)""" + """Divide sequence positions in KV cache (DEPRECATED)""" + ... + + +# // Returns the smallest position present in the KV cache for the specified sequence +# // This is typically non-zero only for SWA caches +# // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache +# // Return -1 if the sequence is empty +# DEPRECATED(LLAMA_API llama_pos llama_kv_self_seq_pos_min( +# struct llama_context * ctx, +# llama_seq_id seq_id), +# "Use llama_memory_seq_pos_min() instead"); +@ctypes_function( + "llama_kv_self_seq_pos_min", [llama_context_p_ctypes, llama_seq_id], llama_pos +) +def llama_kv_self_seq_pos_min( + ctx: llama_context_p, seq_id: Union[llama_seq_id, int], / +) -> int: + """Returns the smallest position in KV cache for sequence (DEPRECATED)""" + ... + + +# // Returns the largest position present in the KV cache for the specified sequence +# // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache +# // Return -1 if the sequence is empty +# DEPRECATED(LLAMA_API llama_pos llama_kv_self_seq_pos_max( +# struct llama_context * ctx, +# llama_seq_id seq_id), +# "Use llama_memory_seq_pos_max() instead"); +@ctypes_function( + "llama_kv_self_seq_pos_max", [llama_context_p_ctypes, llama_seq_id], llama_pos +) +def llama_kv_self_seq_pos_max( + ctx: llama_context_p, seq_id: Union[llama_seq_id, int], / +) -> int: + """Returns the largest position in KV cache for sequence (DEPRECATED)""" ... # // Defragment the KV cache # // This will be applied: # // - lazily on next llama_decode() -# // - explicitly with llama_kv_cache_update() -# LLAMA_API void llama_kv_cache_defrag(struct llama_context * ctx); -@ctypes_function("llama_kv_cache_defrag", [llama_context_p_ctypes], None) -def llama_kv_cache_defrag(ctx: llama_context_p, /): - """Defragment the KV cache - This will be applied: - - lazily on next llama_decode() - - explicitly with llama_kv_cache_update()""" +# DEPRECATED(LLAMA_API void llama_kv_self_defrag(struct llama_context * ctx), +# "simply remove this call, the context will automatically decide when to do a defragmentation based on 'defrag_thold'"); +@ctypes_function("llama_kv_self_defrag", [llama_context_p_ctypes], None) +def llama_kv_self_defrag(ctx: llama_context_p, /): + """Defragment the KV cache (DEPRECATED)""" ... -# // Apply the KV cache updates (such as K-shifts, defragmentation, etc.) 
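The deprecation messages above point at the pos_min/pos_max queries as the replacement for the old token and cell counters; for a single sequence the number of cached positions can be derived like this (sketch):

import llama_cpp

def cached_positions(ctx: llama_cpp.llama_context_p, seq_id: int = 0) -> int:
    mem = llama_cpp.llama_get_memory(ctx)
    pos_min = llama_cpp.llama_memory_seq_pos_min(mem, seq_id)
    pos_max = llama_cpp.llama_memory_seq_pos_max(mem, seq_id)
    if pos_min < 0 or pos_max < 0:
        return 0  # empty sequence
    return pos_max - pos_min + 1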
-# LLAMA_API void llama_kv_cache_update(struct llama_context * ctx); -@ctypes_function("llama_kv_cache_update", [llama_context_p_ctypes], None) -def llama_kv_cache_update(ctx: llama_context_p, /): - """Apply the KV cache updates (such as K-shifts, defragmentation, etc.)""" +# // Check if the context supports KV cache shifting +# DEPRECATED(LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx), +# "use llama_memory_can_shift() instead"); +@ctypes_function("llama_kv_self_can_shift", [llama_context_p_ctypes], ctypes.c_bool) +def llama_kv_self_can_shift(ctx: llama_context_p, /) -> bool: + """Check if the context supports KV cache shifting (DEPRECATED)""" ... -# // Check if the context supports KV cache shifting -# LLAMA_API bool llama_kv_cache_can_shift(struct llama_context * ctx); -@ctypes_function("llama_kv_cache_can_shift", [llama_context_p_ctypes], ctypes.c_bool) -def llama_kv_cache_can_shift(ctx: llama_context_p, /) -> bool: - """Check if the context supports KV cache shifting""" +# // Apply the KV cache updates (such as K-shifts, defragmentation, etc.) +# DEPRECATED(LLAMA_API void llama_kv_self_update(struct llama_context * ctx), +# "simply remove this call, updates are applied lazily on the next llama_decode()"); +@ctypes_function("llama_kv_self_update", [llama_context_p_ctypes], None) +def llama_kv_self_update(ctx: llama_context_p, /): + """Apply the KV cache updates (DEPRECATED)""" ... @@ -2027,14 +2268,13 @@ def llama_kv_cache_can_shift(ctx: llama_context_p, /) -> bool: # // State / sessions # // - # // Returns the *actual* size in bytes of the state -# // (logits, embedding and kv_cache) +# // (logits, embedding and memory) # // Only use when saving the state, not when restoring it, otherwise the size may be too small. # LLAMA_API size_t llama_state_get_size(struct llama_context * ctx); @ctypes_function("llama_state_get_size", [llama_context_p_ctypes], ctypes.c_size_t) def llama_state_get_size(ctx: llama_context_p, /) -> int: - """Returns the *actual* size in bytes of the state (rng, logits, embedding and kv_cache) - will often be smaller after compacting tokens""" + """Returns the *actual* size in bytes of the state (logits, embedding and memory)""" ... @@ -2042,8 +2282,7 @@ def llama_state_get_size(ctx: llama_context_p, /) -> int: # "use llama_state_get_size instead"); @ctypes_function("llama_get_state_size", [llama_context_p_ctypes], ctypes.c_size_t) def llama_get_state_size(ctx: llama_context_p, /) -> int: - """Returns the maximum size in bytes of the state (rng, logits, embedding - and kv_cache) - will often be smaller after compacting tokens""" + """Returns the size in bytes of the state (DEPRECATED)""" ... @@ -2090,9 +2329,7 @@ def llama_state_get_data( def llama_copy_state_data( ctx: llama_context_p, dst: CtypesArray[ctypes.c_uint8], / ) -> int: - """Copies the state to the specified destination address. - Destination needs to have allocated enough memory. - Returns the number of bytes copied""" + """Copies the state to the specified destination address (DEPRECATED)""" ... @@ -2130,7 +2367,7 @@ def llama_state_set_data( def llama_set_state_data( ctx: llama_context_p, src: CtypesArray[ctypes.c_uint8], / ) -> int: - """Set the state reading from the specified address""" + """Set the state reading from the specified address (DEPRECATED)""" ... 
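For the state API above, the non-deprecated entry points are llama_state_get_size / llama_state_get_data / llama_state_set_data. A hedged sketch of saving and restoring a context, assuming the data calls take (ctx, buffer, size) as in recent llama.cpp:

import ctypes
import llama_cpp

def snapshot_state(ctx: llama_cpp.llama_context_p) -> bytes:
    # Query the exact size first, then copy logits/embeddings/memory out.
    n = llama_cpp.llama_state_get_size(ctx)
    buf = (ctypes.c_uint8 * n)()
    written = llama_cpp.llama_state_get_data(ctx, buf, n)
    return bytes(buf[:written])

def restore_state(ctx: llama_cpp.llama_context_p, blob: bytes) -> None:
    buf = (ctypes.c_uint8 * len(blob)).from_buffer_copy(blob)
    llama_cpp.llama_state_set_data(ctx, buf, len(blob))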
@@ -2179,7 +2416,7 @@ def llama_state_load_file( ctypes.c_size_t, ctypes.POINTER(ctypes.c_size_t), ], - ctypes.c_size_t, + ctypes.c_bool, ) def llama_load_session_file( ctx: llama_context_p, @@ -2188,7 +2425,7 @@ def llama_load_session_file( n_token_capacity: Union[ctypes.c_size_t, int], n_token_count_out: CtypesPointerOrRef[ctypes.c_size_t], /, -) -> int: +) -> bool: ... @@ -2231,7 +2468,7 @@ def llama_state_save_file( llama_token_p, ctypes.c_size_t, ], - ctypes.c_size_t, + ctypes.c_bool, ) def llama_save_session_file( ctx: llama_context_p, @@ -2239,11 +2476,11 @@ def llama_save_session_file( tokens: CtypesArray[llama_token], n_token_count: Union[ctypes.c_size_t, int], /, -) -> int: +) -> bool: ... -# // Get the exact size needed to copy the KV cache of a single sequence +# // Get the exact size needed to copy the state of a single sequence # LLAMA_API size_t llama_state_seq_get_size( # struct llama_context * ctx, # llama_seq_id seq_id); @@ -2253,11 +2490,11 @@ def llama_save_session_file( ctypes.c_size_t, ) def llama_state_seq_get_size(ctx: llama_context_p, seq_id: llama_seq_id, /) -> int: - """Get the exact size needed to copy the KV cache of a single sequence""" + """Get the exact size needed to copy the state of a single sequence""" ... -# // Copy the KV cache of a single sequence into the specified buffer +# // Copy the state of a single sequence into the specified buffer # LLAMA_API size_t llama_state_seq_get_data( # struct llama_context * ctx, # uint8_t * dst, @@ -2280,7 +2517,7 @@ def llama_state_seq_get_data( seq_id: llama_seq_id, /, ) -> int: - """Copy the KV cache of a single sequence into the specified buffer""" + """Copy the state of a single sequence into the specified buffer""" ... @@ -2310,7 +2547,7 @@ def llama_state_seq_set_data( dest_seq_id: llama_seq_id, /, ) -> int: - """Copy the sequence data (originally copied with `llama_state_seq_get_data`) into the specified sequence""" + """Copy the sequence data into the specified sequence""" ... @@ -2377,7 +2614,6 @@ def llama_state_seq_load_file( # // Decoding # // - # // Return batch for single sequence of tokens # // The sequence ID will be fixed to 0 # // The position of the tokens will be tracked automatically by llama_decode @@ -2400,7 +2636,7 @@ def llama_batch_get_one( n_tokens: Union[ctypes.c_int, int], /, ) -> llama_batch: - """Return batch for single sequence of tokens starting at pos_0 + """Return batch for single sequence of tokens NOTE: this is a helper function to facilitate transition to the new batch API - avoid using it """ @@ -2445,35 +2681,46 @@ def llama_batch_free(batch: llama_batch, /): ... -# // Processes a batch of tokens with the ecoder part of the encoder-decoder model. -# // Stores the encoder output internally for later use by the decoder cross-attention layers. +# // Process a batch of tokens. +# // In contrast to llama_decode() - this call does not use KV cache. +# // For encode-decoder contexts, processes the batch using the encoder. +# // Can store the encoder output internally for later use by the decoder's cross-attention layers. # // 0 - success -# // < 0 - error +# // < 0 - error. the memory state is restored to the state before this call # LLAMA_API int32_t llama_encode( # struct llama_context * ctx, # struct llama_batch batch); @ctypes_function("llama_encode", [llama_context_p_ctypes, llama_batch], ctypes.c_int32) def llama_encode(ctx: llama_context_p, batch: llama_batch, /) -> int: - """Processes a batch of tokens with the ecoder part of the encoder-decoder model. 
- Stores the encoder output internally for later use by the decoder cross-attention layers. + """Process a batch of tokens using the encoder. 0 - success < 0 - error""" ... +# // Process a batch of tokens. +# // Requires the context to have a memory. +# // For encode-decoder contexts, processes the batch using the decoder. # // Positive return values does not mean a fatal error, but rather a warning. -# // 0 - success -# // 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context) -# // < 0 - error +# // Upon fatal-error or abort, the ubatches that managed to be been processed will remain in the memory state of the context +# // To handle this correctly, query the memory state using llama_memory_seq_pos_min() and llama_memory_seq_pos_max() +# // Upon other return values, the memory state is restored to the state before this call +# // 0 - success +# // 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context) +# // 2 - aborted (processed ubatches will remain in the context's memory) +# // -1 - invalid input batch +# // < -1 - fatal error (processed ubatches will remain in the context's memory) # LLAMA_API int32_t llama_decode( # struct llama_context * ctx, # struct llama_batch batch); @ctypes_function("llama_decode", [llama_context_p_ctypes, llama_batch], ctypes.c_int32) def llama_decode(ctx: llama_context_p, batch: llama_batch, /) -> int: - """Positive return values does not mean a fatal error, but rather a warning. + """Process a batch of tokens. 0 - success 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context) - < 0 - error""" + 2 - aborted (processed ubatches will remain in the context's memory) + -1 - invalid input batch + < -1 - fatal error (processed ubatches will remain in the context's memory)""" ... @@ -2519,13 +2766,12 @@ def llama_n_threads_batch(ctx: llama_context_p, /) -> int: ... -# // Set whether the model is in embeddings mode or not -# // If true, embeddings will be returned but logits will not +# // Set whether the context outputs embeddings or not +# // TODO: rename to avoid confusion with llama_get_embeddings() # LLAMA_API void llama_set_embeddings(struct llama_context * ctx, bool embeddings); @ctypes_function("llama_set_embeddings", [llama_context_p_ctypes, ctypes.c_bool], None) def llama_set_embeddings(ctx: llama_context_p, embeddings: bool, /): - """Set whether the model is in embeddings model or not - If true, embeddings will be returned but logits will not""" + """Set whether the context outputs embeddings or not""" ... @@ -2539,6 +2785,16 @@ def llama_set_causal_attn(ctx: llama_context_p, causal_attn: bool, /): ... +# // Set whether the model is in warmup mode or not +# // If true, all model tensors are activated during llama_decode() to load and cache their weights. +# LLAMA_API void llama_set_warmup(struct llama_context * ctx, bool warmup); +@ctypes_function("llama_set_warmup", [llama_context_p_ctypes, ctypes.c_bool], None) +def llama_set_warmup(ctx: llama_context_p, warmup: bool, /): + """Set whether the model is in warmup mode or not + If true, all model tensors are activated during llama_decode() to load and cache their weights.""" + ... 
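The expanded llama_decode() return contract above is easiest to handle with an explicit check. A minimal sketch:

import llama_cpp

def decode_checked(ctx: llama_cpp.llama_context_p, batch: llama_cpp.llama_batch) -> None:
    ret = llama_cpp.llama_decode(ctx, batch)
    if ret == 0:
        return  # success
    if ret == 1:
        raise RuntimeError("no KV slot for this batch; shrink the batch or grow the context")
    if ret == 2:
        raise RuntimeError("decode aborted; processed ubatches remain in the context's memory")
    if ret == -1:
        raise ValueError("invalid input batch")
    raise RuntimeError(f"fatal llama_decode error: {ret}")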
+ + # // Set abort callback # LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data); @ctypes_function( @@ -2644,7 +2900,7 @@ def llama_get_embeddings_ith( # // Get the embeddings for a sequence id # // Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE -# // when pooling_type == LLAMA_POOLING_TYPE_RANK, returns float[1] with the rank of the sequence +# // when pooling_type == LLAMA_POOLING_TYPE_RANK, returns float[n_cls_out] with the rank(s) of the sequence # // otherwise: float[n_embd] (1-dimensional) # LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id); @ctypes_function( @@ -2665,7 +2921,6 @@ def llama_get_embeddings_seq( # // Vocab # // - # LLAMA_API const char * llama_vocab_get_text(const struct llama_vocab * vocab, llama_token token); @ctypes_function( "llama_vocab_get_text", [llama_vocab_p_ctypes, llama_token], ctypes.c_char_p @@ -2719,8 +2974,6 @@ def llama_vocab_is_control( # // Special tokens - - # LLAMA_API llama_token llama_vocab_bos(const struct llama_vocab * vocab); // beginning-of-sentence @ctypes_function("llama_vocab_bos", [llama_vocab_p_ctypes], llama_token) def llama_vocab_bos(vocab: llama_vocab_p, /) -> llama_token: @@ -2762,6 +3015,7 @@ def llama_vocab_pad(vocab: llama_vocab_p, /) -> llama_token: """padding""" ... + # LLAMA_API bool llama_vocab_get_add_bos(const struct llama_vocab * vocab); @ctypes_function( "llama_vocab_get_add_bos", @@ -2782,6 +3036,16 @@ def llama_vocab_get_add_eos(vocab: llama_vocab_p, /) -> bool: ... +# LLAMA_API bool llama_vocab_get_add_sep(const struct llama_vocab * vocab); +@ctypes_function( + "llama_vocab_get_add_sep", + [llama_vocab_p_ctypes], + ctypes.c_bool, +) +def llama_vocab_get_add_sep(vocab: llama_vocab_p, /) -> bool: + ... + + # LLAMA_API llama_token llama_vocab_fim_pre(const struct llama_vocab * vocab); @ctypes_function( "llama_vocab_fim_pre", @@ -2842,7 +3106,7 @@ def llama_vocab_fim_sep(vocab: llama_vocab_p, /) -> llama_token: ... - +# DEPRECATED functions # DEPRECATED(LLAMA_API const char * llama_token_get_text(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_get_text instead"); @ctypes_function( "llama_token_get_text", @@ -3056,11 +3320,11 @@ def llama_vocab_cls(vocab: llama_vocab_p, /) -> llama_token: # // The API is thread-safe. # // - # /// @details Convert the provided text into tokens. # /// @param tokens The tokens pointer must be large enough to hold the resulting tokens. # /// @return Returns the number of tokens on success, no more than n_tokens_max # /// @return Returns a negative number on failure - the number of tokens that would have been returned +# /// @return Returns INT32_MIN on overflow (e.g., tokenization result size exceeds int32_t limit) # /// @param add_special Allow to add BOS and EOS tokens if model is configured to do so. # /// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated # /// as plaintext. Does not insert a leading space. @@ -3103,7 +3367,7 @@ def llama_tokenize( text_len: The length of the text. tokens: The tokens pointer must be large enough to hold the resulting tokens. n_max_tokens: The maximum number of tokens to return. - add_special: Allow adding special tokenns if the model is configured to do so. + add_special: Allow adding special tokens if the model is configured to do so. parse_special: Allow parsing special tokens. Returns: @@ -3161,23 +3425,6 @@ def llama_token_to_piece( ... 
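A small sketch of the vocab-centric accessors above; the vocab handle comes from llama_model_get_vocab(), and llama_vocab_eos is assumed to be bound alongside the functions shown here:

import llama_cpp

def special_tokens(model: llama_cpp.llama_model_p) -> dict:
    vocab = llama_cpp.llama_model_get_vocab(model)
    return {
        "bos": llama_cpp.llama_vocab_bos(vocab),
        "eos": llama_cpp.llama_vocab_eos(vocab),
        "pad": llama_cpp.llama_vocab_pad(vocab),
        "add_bos": llama_cpp.llama_vocab_get_add_bos(vocab),
        "add_sep": llama_cpp.llama_vocab_get_add_sep(vocab),  # new in this revision
    }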
-# # // check if token0 is contained as a prefix in token1 -# # LLAMA_API bool llama_token_is_prefix( -# # const struct llama_model * model, -# # llama_token token0, -# # llama_token token1); -# @ctypes_function( -# "llama_token_is_prefix", -# [llama_model_p_ctypes, llama_token, llama_token], -# ctypes.c_bool, -# ) -# def llama_token_is_prefix( -# model: llama_model_p, token0: Union[llama_token, int], token1: Union[llama_token, int], / -# ) -> bool: -# """Check if token0 is contained as a prefix in token1""" -# ... - - # /// @details Convert the provided tokens into text (inverse of llama_tokenize()). # /// @param text The char pointer must be large enough to hold the resulting text. # /// @return Returns the number of chars/bytes on success, no more than text_len_max. @@ -3185,7 +3432,7 @@ def llama_token_to_piece( # /// @param remove_special Allow to remove BOS and EOS tokens if model is configured to do so. # /// @param unparse_special If true, special tokens are rendered in the output. # LLAMA_API int32_t llama_detokenize( -# const struct llama_model * model, +# const struct llama_vocab * vocab, # const llama_token * tokens, # int32_t n_tokens, # char * text, @@ -3195,7 +3442,7 @@ def llama_token_to_piece( @ctypes_function( "llama_detokenize", [ - llama_model_p_ctypes, + llama_vocab_p_ctypes, ctypes.POINTER(llama_token), ctypes.c_int32, ctypes.c_char_p, @@ -3206,7 +3453,7 @@ def llama_token_to_piece( ctypes.c_int32, ) def llama_detokenize( - model: llama_model_p, + vocab: llama_vocab_p, tokens: CtypesArray[llama_token], n_tokens: Union[ctypes.c_int, int], text: bytes, @@ -3218,7 +3465,7 @@ def llama_detokenize( """Convert the provided tokens into text (inverse of llama_tokenize()). Args: - model: The model to use for tokenization. + vocab: The vocabulary to use for tokenization. tokens: The tokens to convert. n_tokens: The number of tokens. text: The buffer to write the text to. @@ -3232,11 +3479,10 @@ def llama_detokenize( # // Chat templates # // - # /// Apply chat template. Inspired by hf apply_chat_template() on python. # /// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model" -# /// NOTE: This function does not use a jinja parser. It only support a pre-defined list of template. See more: https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template -# /// @param tmpl A Jinja template to use for this chat. If this is nullptr, the model’s default chat template will be used instead. +# /// NOTE: This function does not use a jinja parser. It only support a pre-defined list of template. See more: https://github.com/ggml-org/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template +# /// @param tmpl A Jinja template to use for this chat. If this is nullptr, the model's default chat template will be used instead. # /// @param chat Pointer to a list of multiple llama_chat_message # /// @param n_msg Number of llama_chat_message in this chat # /// @param add_ass Whether to end the prompt with the token(s) that indicate the start of an assistant message. 
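Since llama_detokenize() now takes the vocab rather than the model, a tokenize/detokenize round trip looks roughly like this (buffer sizes are illustrative, and the negative-return convention is used to detect undersized buffers):

import ctypes
import llama_cpp

def roundtrip(model: llama_cpp.llama_model_p, text: str) -> str:
    vocab = llama_cpp.llama_model_get_vocab(model)
    raw = text.encode("utf-8")

    toks = (llama_cpp.llama_token * (len(raw) + 8))()  # generous upper bound
    n = llama_cpp.llama_tokenize(vocab, raw, len(raw), toks, len(toks), True, False)
    if n < 0:
        raise RuntimeError("token buffer too small")

    out = ctypes.create_string_buffer(4 * len(raw) + 16)  # illustrative size
    m = llama_cpp.llama_detokenize(vocab, toks, n, out, len(out), False, False)
    if m < 0:
        raise RuntimeError("text buffer too small")
    return out.raw[:m].decode("utf-8", errors="replace")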
@@ -3318,41 +3564,6 @@ def llama_chat_builtin_templates( # // # // Sampling API # // -# // Sample usage: -# // -# // // prepare the sampling chain at the start -# // auto sparams = llama_sampler_chain_default_params(); -# // -# // llama_sampler * smpl = llama_sampler_chain_init(sparams); -# // -# // llama_sampler_chain_add(smpl, llama_sampler_init_top_k(50)); -# // llama_sampler_chain_add(smpl, llama_sampler_init_top_p(0.9, 1)); -# // llama_sampler_chain_add(smpl, llama_sampler_init_temp (0.8)); -# // -# // // typically, the chain should end with a sampler such as "greedy", "dist" or "mirostat" -# // // this sampler will be responsible to select the actual token -# // llama_sampler_chain_add(smpl, llama_sampler_init_dist(seed)); -# // -# // ... -# // -# // // decoding loop: -# // while (...) { -# // ... -# // -# // llama_decode(ctx, batch); -# // -# // // sample from the logits of the last token in the batch -# // const llama_token id = llama_sampler_sample(smpl, ctx, -1); -# // -# // // accepting the token updates the internal state of certain samplers (e.g. grammar, repetition, etc.) -# // llama_sampler_accept(smpl, id); -# // ... -# // } -# // -# // llama_sampler_free(smpl); -# // -# // TODO: In the future, llama_sampler will be utilized to offload the sampling to the backends (e.g. GPU). -# // # typedef void * llama_sampler_context_t; llama_sampler_context_t = ctypes.c_void_p @@ -3366,7 +3577,7 @@ def llama_chat_builtin_templates( # void (*reset) ( struct llama_sampler * smpl); // can be NULL # struct llama_sampler * (*clone) (const struct llama_sampler * smpl); // can be NULL if ctx is NULL # void (*free) ( struct llama_sampler * smpl); // can be NULL if ctx is NULL -# + # // TODO: API for internal libllama usage for appending the sampling to an existing ggml_cgraph # //void (*apply_ggml) (struct llama_sampler * smpl, ...); # }; @@ -3375,8 +3586,8 @@ class llama_sampler_i(ctypes.Structure): # struct llama_sampler { -# struct llama_sampler_i * iface; -# llama_sampler_context_t ctx; +# const struct llama_sampler_i * iface; +# llama_sampler_context_t ctx; # }; class llama_sampler(ctypes.Structure): _fields_ = [ @@ -3410,6 +3621,18 @@ class llama_sampler(ctypes.Structure): # // mirror of llama_sampler_i: +# LLAMA_API struct llama_sampler * llama_sampler_init (const struct llama_sampler_i * iface, llama_sampler_context_t ctx); +@ctypes_function( + "llama_sampler_init", + [ctypes.POINTER(llama_sampler_i), llama_sampler_context_t], + llama_sampler_p_ctypes, +) +def llama_sampler_init( + iface: ctypes.POINTER(llama_sampler_i), ctx: llama_sampler_context_t, / +) -> llama_sampler_p: + ... 
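The sampler-chain usage that the removed C comment described maps almost one to one onto these bindings. A minimal sketch:

import llama_cpp

def build_sampler(seed: int = 1234) -> llama_cpp.llama_sampler_p:
    sparams = llama_cpp.llama_sampler_chain_default_params()
    chain = llama_cpp.llama_sampler_chain_init(sparams)
    llama_cpp.llama_sampler_chain_add(chain, llama_cpp.llama_sampler_init_top_k(50))
    llama_cpp.llama_sampler_chain_add(chain, llama_cpp.llama_sampler_init_top_p(0.9, 1))
    llama_cpp.llama_sampler_chain_add(chain, llama_cpp.llama_sampler_init_temp(0.8))
    # The chain should end with a selecting sampler such as dist, greedy or mirostat.
    llama_cpp.llama_sampler_chain_add(chain, llama_cpp.llama_sampler_init_dist(seed))
    return chain

# In a decoding loop, after llama_decode(ctx, batch):
#   token = llama_cpp.llama_sampler_sample(chain, ctx, -1)
# and when finished:
#   llama_cpp.llama_sampler_free(chain)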
+ + # LLAMA_API const char * llama_sampler_name (const struct llama_sampler * smpl); @ctypes_function( "llama_sampler_name", @@ -3475,7 +3698,7 @@ def llama_sampler_free(smpl: llama_sampler_p, /): # // llama_sampler_chain # // a type of llama_sampler that can chain multiple samplers one after another -# + # LLAMA_API struct llama_sampler * llama_sampler_chain_init(struct llama_sampler_chain_params params); @ctypes_function( "llama_sampler_chain_init", @@ -3533,7 +3756,7 @@ def llama_sampler_chain_remove( # // available samplers: -# + # LLAMA_API struct llama_sampler * llama_sampler_init_greedy(void); @ctypes_function("llama_sampler_init_greedy", [], llama_sampler_p_ctypes) def llama_sampler_init_greedy() -> llama_sampler_p: @@ -3549,13 +3772,14 @@ def llama_sampler_init_dist(seed: int) -> llama_sampler_p: # /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. # /// NOTE: Avoid using on the full vocabulary as the sorting can become slow. For example, apply top-k or top-p sampling first. # DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_softmax (void), -# "will be removed in the future (see https://github.com/ggerganov/llama.cpp/pull/9896#discussion_r1800920915)"); +# "will be removed in the future (see https://github.com/ggml-org/llama.cpp/pull/9896#discussion_r1800920915)"); @ctypes_function("llama_sampler_init_softmax", [], llama_sampler_p_ctypes) def llama_sampler_init_softmax() -> llama_sampler_p: ... # /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 +# /// Setting k <= 0 makes this a noop # LLAMA_API struct llama_sampler * llama_sampler_init_top_k (int32_t k); @ctypes_function("llama_sampler_init_top_k", [ctypes.c_int32], llama_sampler_p_ctypes) def llama_sampler_init_top_k(k: int) -> llama_sampler_p: @@ -3573,7 +3797,7 @@ def llama_sampler_init_top_p(p: float, min_keep: int) -> llama_sampler_p: ... -# /// @details Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841 +# /// @details Minimum P sampling as described in https://github.com/ggml-org/llama.cpp/pull/3841 # LLAMA_API struct llama_sampler * llama_sampler_init_min_p (float p, size_t min_keep); @ctypes_function( "llama_sampler_init_min_p", @@ -3595,6 +3819,7 @@ def llama_sampler_init_typical(p: float, min_keep: int) -> llama_sampler_p: ... +# /// #details Updates the logits l_i` = l_i/t. When t <= 0.0f, the maximum logit is kept at it's original value, the rest are set to -inf # LLAMA_API struct llama_sampler * llama_sampler_init_temp (float t); @ctypes_function("llama_sampler_init_temp", [ctypes.c_float], llama_sampler_p_ctypes) def llama_sampler_init_temp(t: float) -> llama_sampler_p: @@ -3627,12 +3852,18 @@ def llama_sampler_init_xtc( ... +# /// @details Top n sigma sampling as described in academic paper "Top-nσ: Not All Logits Are You Need" https://arxiv.org/pdf/2411.07641 +# LLAMA_API struct llama_sampler * llama_sampler_init_top_n_sigma(float n); +@ctypes_function( + "llama_sampler_init_top_n_sigma", + [ctypes.c_float], + llama_sampler_p_ctypes, +) +def llama_sampler_init_top_n_sigma(n: float, /) -> llama_sampler_p: + ... + + # /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. -# /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. 
-# /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. -# /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. -# /// @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm. -# /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. # LLAMA_API struct llama_sampler * llama_sampler_init_mirostat( # int32_t n_vocab, # uint32_t seed, @@ -3651,10 +3882,6 @@ def llama_sampler_init_mirostat( # /// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. -# /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. -# /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. -# /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. -# /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. # LLAMA_API struct llama_sampler * llama_sampler_init_mirostat_v2( # uint32_t seed, # float tau, @@ -3670,6 +3897,7 @@ def llama_sampler_init_mirostat_v2( ... +# /// @details Intializes a GBNF grammar, see grammars/README.md for details. # LLAMA_API struct llama_sampler * llama_sampler_init_grammar( # const struct llama_vocab * vocab, # const char * grammar_str, @@ -3685,6 +3913,76 @@ def llama_sampler_init_grammar( ... 
+# DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_grammar_lazy( +# const struct llama_vocab * vocab, +# const char * grammar_str, +# const char * grammar_root, +# const char ** trigger_words, +# size_t num_trigger_words, +# const llama_token * trigger_tokens, +# size_t num_trigger_tokens), +# "use llama_sampler_init_grammar_lazy_patterns instead"); +@ctypes_function( + "llama_sampler_init_grammar_lazy", + [ + llama_vocab_p_ctypes, + ctypes.c_char_p, + ctypes.c_char_p, + ctypes.POINTER(ctypes.c_char_p), + ctypes.c_size_t, + ctypes.POINTER(llama_token), + ctypes.c_size_t, + ], + llama_sampler_p_ctypes, +) +def llama_sampler_init_grammar_lazy( + vocab: llama_vocab_p, + grammar_str: bytes, + grammar_root: bytes, + trigger_words: CtypesArray[bytes], + num_trigger_words: int, + trigger_tokens: CtypesArray[llama_token], + num_trigger_tokens: int, + /, +) -> llama_sampler_p: + ... + + +# /// @details Lazy grammar sampler, introduced in https://github.com/ggml-org/llama.cpp/pull/9639 +# LLAMA_API struct llama_sampler * llama_sampler_init_grammar_lazy_patterns( +# const struct llama_vocab * vocab, +# const char * grammar_str, +# const char * grammar_root, +# const char ** trigger_patterns, +# size_t num_trigger_patterns, +# const llama_token * trigger_tokens, +# size_t num_trigger_tokens); +@ctypes_function( + "llama_sampler_init_grammar_lazy_patterns", + [ + llama_vocab_p_ctypes, + ctypes.c_char_p, + ctypes.c_char_p, + ctypes.POINTER(ctypes.c_char_p), + ctypes.c_size_t, + ctypes.POINTER(llama_token), + ctypes.c_size_t, + ], + llama_sampler_p_ctypes, +) +def llama_sampler_init_grammar_lazy_patterns( + vocab: llama_vocab_p, + grammar_str: bytes, + grammar_root: bytes, + trigger_patterns: CtypesArray[bytes], + num_trigger_patterns: int, + trigger_tokens: CtypesArray[llama_token], + num_trigger_tokens: int, + /, +) -> llama_sampler_p: + ... + + # /// NOTE: Avoid using on the full vocabulary as searching for repeated tokens can become slow. For example, apply top-k or top-p sampling first. # LLAMA_API struct llama_sampler * llama_sampler_init_penalties( # int32_t penalty_last_n, // last n tokens to penalize (0 = disable penalty, -1 = context size) @@ -3737,7 +4035,7 @@ def llama_sampler_init_dry( dry_base: float, dry_allowed_length: int, dry_penalty_last_n: int, - seq_breakers: CtypesArray[bytes], + seq_breakers, num_breakers: int, /, ) -> llama_sampler_p: @@ -3760,26 +4058,6 @@ def llama_sampler_init_logit_bias( # // this sampler is meant to be used for fill-in-the-middle infilling -# // it's supposed to be used after top_k + top_p sampling -# // -# // 1. if the sum of the EOG probs times the number of candidates is higher than the sum of the other probs -> pick EOG -# // 2. combine probs of tokens that have the same prefix -# // -# // example: -# // -# // - before: -# // "hel": 0.5 -# // "hell": 0.2 -# // "hello": 0.1 -# // "dummy": 0.1 -# // -# // - after: -# // "hel": 0.8 -# // "dummy": 0.1 -# // -# // 3. discard non-EOG tokens with low prob -# // 4. if no tokens are left -> pick EOT -# // # LLAMA_API struct llama_sampler * llama_sampler_init_infill(const struct llama_vocab * vocab); @ctypes_function( "llama_sampler_init_infill", @@ -3802,15 +4080,6 @@ def llama_sampler_get_seed(smpl: llama_sampler_p, /) -> int: # /// @details Sample and accept a token from the idx-th output of the last evaluation -# // -# // Shorthand for: -# // const auto * logits = llama_get_logits_ith(ctx, idx); -# // llama_token_data_array cur_p = { ... init from logits ... 
}; -# // llama_sampler_apply(smpl, &cur_p); -# // auto token = cur_p.data[cur_p.selected].id; -# // llama_sampler_accept(smpl, token); -# // return token; -# // Returns the sampled token # LLAMA_API llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_context * ctx, int32_t idx); @ctypes_function( "llama_sampler_sample", @@ -3827,10 +4096,7 @@ def llama_sampler_sample( # // Model split # // - # /// @details Build a split GGUF final path for this chunk. -# /// llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf" -# // Returns the split_path length. # LLAMA_API int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count); @ctypes_function( "llama_split_path", @@ -3850,8 +4116,6 @@ def llama_split_path( # /// @details Extract the path prefix from the split_path if and only if the split_no and split_count match. -# /// llama_split_prefix(split_prefix, 64, "/models/ggml-model-q4_0-00002-of-00004.gguf", 2, 4) => split_prefix = "/models/ggml-model-q4_0" -# // Returns the split_prefix length. # LLAMA_API int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count); @ctypes_function( "llama_split_prefix", @@ -3899,16 +4163,13 @@ def llama_log_set( # // # // Performance utils # // -# // NOTE: Used by llama.cpp examples, avoid using in third-party apps. Instead, do your own performance measurements. -# // - # struct llama_perf_context_data { # double t_start_ms; # double t_load_ms; # double t_p_eval_ms; # double t_eval_ms; -# + # int32_t n_p_eval; # int32_t n_eval; # }; @@ -3925,7 +4186,7 @@ class llama_perf_context_data(ctypes.Structure): # struct llama_perf_sampler_data { # double t_sample_ms; -# + # int32_t n_sample; # }; class llama_perf_sampler_data(ctypes.Structure): @@ -3996,3 +4257,83 @@ def llama_perf_sampler_reset(chain: llama_sampler_p, /): ... +# // +# // training +# // + +# // function that returns whether or not a given tensor contains trainable parameters +# typedef bool (*llama_opt_param_filter)(const struct ggml_tensor * tensor, void * userdata); +llama_opt_param_filter = ctypes.CFUNCTYPE(ctypes.c_bool, ctypes.c_void_p, ctypes.c_void_p) + +# // always returns true +# LLAMA_API bool llama_opt_param_filter_all(const struct ggml_tensor * tensor, void * userdata); +@ctypes_function( + "llama_opt_param_filter_all", + [ctypes.c_void_p, ctypes.c_void_p], + ctypes.c_bool, +) +def llama_opt_param_filter_all(tensor: ctypes.c_void_p, userdata: ctypes.c_void_p, /) -> bool: + ... 
+ + +# struct llama_opt_params { +# uint32_t n_ctx_train; // assumed context size post training, use context size specified in llama_context if 0 + +# llama_opt_param_filter param_filter; // callback for determining which tensors contain trainable parameters +# void * param_filter_ud; // userdata for determining which tensors contain trainable parameters + +# ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters +# void * get_opt_pars_ud; // userdata for calculating optimizer parameters +# }; +class llama_opt_params(ctypes.Structure): + _fields_ = [ + ("n_ctx_train", ctypes.c_uint32), + ("param_filter", llama_opt_param_filter), + ("param_filter_ud", ctypes.c_void_p), + ("get_opt_pars", ctypes.c_void_p), # ggml_opt_get_optimizer_params - not implemented here + ("get_opt_pars_ud", ctypes.c_void_p), + ] + + +# LLAMA_API void llama_opt_init(struct llama_context * lctx, struct llama_model * model, struct llama_opt_params lopt_params); +@ctypes_function( + "llama_opt_init", + [llama_context_p_ctypes, llama_model_p_ctypes, llama_opt_params], + None, +) +def llama_opt_init(lctx: llama_context_p, model: llama_model_p, lopt_params: llama_opt_params, /): + ... + + +# LLAMA_API void llama_opt_epoch( +# struct llama_context * lctx, +# ggml_opt_dataset_t dataset, +# ggml_opt_result_t result_train, +# ggml_opt_result_t result_eval, +# int64_t idata_split, +# ggml_opt_epoch_callback callback_train, +# ggml_opt_epoch_callback callback_eval); +@ctypes_function( + "llama_opt_epoch", + [ + llama_context_p_ctypes, + ctypes.c_void_p, # ggml_opt_dataset_t + ctypes.c_void_p, # ggml_opt_result_t + ctypes.c_void_p, # ggml_opt_result_t + ctypes.c_int64, + ctypes.c_void_p, # ggml_opt_epoch_callback + ctypes.c_void_p, # ggml_opt_epoch_callback + ], + None, +) +def llama_opt_epoch( + lctx: llama_context_p, + dataset: ctypes.c_void_p, + result_train: ctypes.c_void_p, + result_eval: ctypes.c_void_p, + idata_split: int, + callback_train: ctypes.c_void_p, + callback_eval: ctypes.c_void_p, + /, +): + ... 
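
For readers of this patch, here is a minimal sketch (not part of the diff) of how the sampler-chain bindings touched above could be driven from Python. It uses only wrappers exposed by llama_cpp.py (llama_sampler_chain_default_params, llama_sampler_chain_init, llama_sampler_chain_add, the llama_sampler_init_* constructors including the new llama_sampler_init_top_n_sigma, llama_sampler_sample, llama_sampler_free); the parameter values and the ordering of the stages are illustrative assumptions, not recommendations made by the patch.

import llama_cpp.llama_cpp as llama_cpp

# Build a standalone sampler chain; a real caller would pair it with a live
# llama_context and draw tokens via llama_sampler_sample(chain, ctx, -1).
params = llama_cpp.llama_sampler_chain_default_params()
chain = llama_cpp.llama_sampler_chain_init(params)

# Truncation stages first, then temperature, then the final probabilistic pick.
# The concrete values (40, 1.0, 0.8, 1234) are placeholders for illustration.
llama_cpp.llama_sampler_chain_add(chain, llama_cpp.llama_sampler_init_top_k(40))
llama_cpp.llama_sampler_chain_add(chain, llama_cpp.llama_sampler_init_top_n_sigma(1.0))  # binding added in this patch
llama_cpp.llama_sampler_chain_add(chain, llama_cpp.llama_sampler_init_temp(0.8))
llama_cpp.llama_sampler_chain_add(chain, llama_cpp.llama_sampler_init_dist(1234))

# With a loaded model/context this would sample from the last set of logits:
#   token = llama_cpp.llama_sampler_sample(chain, ctx, -1)

# The chain takes ownership of the samplers added to it, so freeing the chain
# is sufficient to release everything.
llama_cpp.llama_sampler_free(chain)
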
diff --git a/llama_cpp/mtmd_cpp.py b/llama_cpp/mtmd_cpp.py new file mode 100644 index 000000000..a45f8f406 --- /dev/null +++ b/llama_cpp/mtmd_cpp.py @@ -0,0 +1,280 @@ +from __future__ import annotations + +import os +from ctypes import ( + c_bool, + c_char_p, + c_int, + c_uint8, + c_uint32, + c_float, + c_void_p, + c_size_t, + POINTER, + _Pointer, # type: ignore + Structure, + byref, +) +import pathlib +from typing import ( + Union, + NewType, + Optional, + TYPE_CHECKING, +) + +import llama_cpp.llama_cpp as llama_cpp + +from llama_cpp._ctypes_extensions import ( + load_shared_library, + ctypes_function_for_shared_library, +) + +if TYPE_CHECKING: + from llama_cpp._ctypes_extensions import ( + CtypesArray, + ) + + +# Specify the base name of the shared library to load +_libmtmd_base_name = "mtmd" +_libmtmd_override_path = os.environ.get("MTMD_CPP_LIB") +_libmtmd_base_path = pathlib.Path(os.path.abspath(os.path.dirname(__file__))) / "lib" if _libmtmd_override_path is None else pathlib.Path() + +# Load the library +_libmtmd = load_shared_library(_libmtmd_base_name, _libmtmd_base_path) + +ctypes_function = ctypes_function_for_shared_library(_libmtmd) + +################################################ +# mtmd.h types +################################################ + +# Opaque types +mtmd_context_p = NewType("mtmd_context_p", int) +mtmd_context_p_ctypes = c_void_p + +mtmd_bitmap_p = NewType("mtmd_bitmap_p", int) +mtmd_bitmap_p_ctypes = c_void_p + +mtmd_image_tokens_p = NewType("mtmd_image_tokens_p", int) +mtmd_image_tokens_p_ctypes = c_void_p + +mtmd_input_chunk_p = NewType("mtmd_input_chunk_p", int) +mtmd_input_chunk_p_ctypes = c_void_p + +mtmd_input_chunks_p = NewType("mtmd_input_chunks_p", int) +mtmd_input_chunks_p_ctypes = c_void_p + +# Enums +MTMD_INPUT_CHUNK_TYPE_TEXT = 0 +MTMD_INPUT_CHUNK_TYPE_IMAGE = 1 +MTMD_INPUT_CHUNK_TYPE_AUDIO = 2 + +# Structures +class mtmd_context_params(Structure): + _fields_ = [ + ("use_gpu", c_bool), + ("print_timings", c_bool), + ("n_threads", c_int), + ("verbosity", c_int), # ggml_log_level + ("image_marker", c_char_p), + ("media_marker", c_char_p), + ] + +class mtmd_input_text(Structure): + _fields_ = [ + ("text", c_char_p), + ("add_special", c_bool), + ("parse_special", c_bool), + ] + +################################################ +# mtmd.h functions +################################################ + +# MTMD_API const char * mtmd_default_marker(void); +@ctypes_function("mtmd_default_marker", [], c_char_p) +def mtmd_default_marker() -> bytes: + ... + +# MTMD_API struct mtmd_context_params mtmd_context_params_default(void); +@ctypes_function("mtmd_context_params_default", [], mtmd_context_params) +def mtmd_context_params_default() -> mtmd_context_params: + ... + +# MTMD_API mtmd_context * mtmd_init_from_file(const char * mmproj_fname, +# const struct llama_model * text_model, +# const struct mtmd_context_params ctx_params); +@ctypes_function( + "mtmd_init_from_file", + [c_char_p, llama_cpp.llama_model_p_ctypes, mtmd_context_params], + mtmd_context_p_ctypes +) +def mtmd_init_from_file( + mmproj_fname: bytes, + text_model: llama_cpp.llama_model_p, + ctx_params: mtmd_context_params, + /, +) -> Optional[mtmd_context_p]: + ... + +# MTMD_API void mtmd_free(mtmd_context * ctx); +@ctypes_function("mtmd_free", [mtmd_context_p_ctypes], None) +def mtmd_free(ctx: mtmd_context_p, /): + ... 
+ +# MTMD_API bool mtmd_support_vision(mtmd_context * ctx); +@ctypes_function("mtmd_support_vision", [mtmd_context_p_ctypes], c_bool) +def mtmd_support_vision(ctx: mtmd_context_p, /) -> bool: + ... + +# MTMD_API mtmd_bitmap * mtmd_bitmap_init(uint32_t nx, uint32_t ny, const unsigned char * data); +@ctypes_function( + "mtmd_bitmap_init", + [c_uint32, c_uint32, POINTER(c_uint8)], + mtmd_bitmap_p_ctypes +) +def mtmd_bitmap_init( + nx: Union[c_uint32, int], + ny: Union[c_uint32, int], + data: CtypesArray[c_uint8], + /, +) -> Optional[mtmd_bitmap_p]: + ... + +# MTMD_API void mtmd_bitmap_free(mtmd_bitmap * bitmap); +@ctypes_function("mtmd_bitmap_free", [mtmd_bitmap_p_ctypes], None) +def mtmd_bitmap_free(bitmap: mtmd_bitmap_p, /): + ... + +# MTMD_API mtmd_input_chunks * mtmd_input_chunks_init(void); +@ctypes_function("mtmd_input_chunks_init", [], mtmd_input_chunks_p_ctypes) +def mtmd_input_chunks_init() -> Optional[mtmd_input_chunks_p]: + ... + +# MTMD_API void mtmd_input_chunks_free(mtmd_input_chunks * chunks); +@ctypes_function("mtmd_input_chunks_free", [mtmd_input_chunks_p_ctypes], None) +def mtmd_input_chunks_free(chunks: mtmd_input_chunks_p, /): + ... + +# MTMD_API size_t mtmd_input_chunks_size(const mtmd_input_chunks * chunks); +@ctypes_function("mtmd_input_chunks_size", [mtmd_input_chunks_p_ctypes], c_size_t) +def mtmd_input_chunks_size(chunks: mtmd_input_chunks_p, /) -> int: + ... + +# MTMD_API const mtmd_input_chunk * mtmd_input_chunks_get(const mtmd_input_chunks * chunks, size_t idx); +@ctypes_function( + "mtmd_input_chunks_get", + [mtmd_input_chunks_p_ctypes, c_size_t], + mtmd_input_chunk_p_ctypes +) +def mtmd_input_chunks_get( + chunks: mtmd_input_chunks_p, idx: Union[c_size_t, int], / +) -> Optional[mtmd_input_chunk_p]: + ... + +# MTMD_API int32_t mtmd_tokenize(mtmd_context * ctx, +# mtmd_input_chunks * output, +# const mtmd_input_text * text, +# const mtmd_bitmap ** bitmaps, +# size_t n_bitmaps); +@ctypes_function( + "mtmd_tokenize", + [ + mtmd_context_p_ctypes, + mtmd_input_chunks_p_ctypes, + POINTER(mtmd_input_text), + POINTER(mtmd_bitmap_p_ctypes), + c_size_t, + ], + c_int, +) +def mtmd_tokenize( + ctx: mtmd_context_p, + output: mtmd_input_chunks_p, + text: "_Pointer[mtmd_input_text]", + bitmaps: CtypesArray[mtmd_bitmap_p_ctypes], + n_bitmaps: Union[c_size_t, int], + /, +) -> int: + ... + +# MTMD_API size_t mtmd_input_chunk_get_n_tokens(const mtmd_input_chunk * chunk); +@ctypes_function("mtmd_input_chunk_get_n_tokens", [mtmd_input_chunk_p_ctypes], c_size_t) +def mtmd_input_chunk_get_n_tokens(chunk: mtmd_input_chunk_p, /) -> int: + ... + +# MTMD_API enum mtmd_input_chunk_type mtmd_input_chunk_get_type(const mtmd_input_chunk * chunk); +@ctypes_function("mtmd_input_chunk_get_type", [mtmd_input_chunk_p_ctypes], c_int) +def mtmd_input_chunk_get_type(chunk: mtmd_input_chunk_p, /) -> int: + ... + +# MTMD_API const llama_token * mtmd_input_chunk_get_tokens_text(const mtmd_input_chunk * chunk, size_t * n_tokens_output); +@ctypes_function( + "mtmd_input_chunk_get_tokens_text", + [mtmd_input_chunk_p_ctypes, POINTER(c_size_t)], + POINTER(llama_cpp.llama_token) +) +def mtmd_input_chunk_get_tokens_text( + chunk: mtmd_input_chunk_p, n_tokens_output: "_Pointer[c_size_t]", / +) -> Optional["_Pointer[llama_cpp.llama_token]"]: + ... 
+ +################################################ +# mtmd-helper.h functions +################################################ + +# MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len); +@ctypes_function( + "mtmd_helper_bitmap_init_from_buf", + [mtmd_context_p_ctypes, POINTER(c_uint8), c_size_t], + mtmd_bitmap_p_ctypes +) +def mtmd_helper_bitmap_init_from_buf( + ctx: mtmd_context_p, + buf: CtypesArray[c_uint8], + length: Union[c_size_t, int], + /, +) -> Optional[mtmd_bitmap_p]: + ... + +# MTMD_API size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks); +@ctypes_function("mtmd_helper_get_n_tokens", [mtmd_input_chunks_p_ctypes], c_size_t) +def mtmd_helper_get_n_tokens(chunks: mtmd_input_chunks_p, /) -> int: + ... + +# MTMD_API int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx, +# struct llama_context * lctx, +# const mtmd_input_chunk * chunk, +# llama_pos n_past, +# llama_seq_id seq_id, +# int32_t n_batch, +# bool logits_last, +# llama_pos * new_n_past); +@ctypes_function( + "mtmd_helper_eval_chunk_single", + [ + mtmd_context_p_ctypes, + llama_cpp.llama_context_p_ctypes, + mtmd_input_chunk_p_ctypes, + llama_cpp.llama_pos, + llama_cpp.llama_seq_id, + c_int, + c_bool, + POINTER(llama_cpp.llama_pos), + ], + c_int, +) +def mtmd_helper_eval_chunk_single( + ctx: mtmd_context_p, + lctx: llama_cpp.llama_context_p, + chunk: mtmd_input_chunk_p, + n_past: llama_cpp.llama_pos, + seq_id: llama_cpp.llama_seq_id, + n_batch: Union[c_int, int], + logits_last: Union[c_bool, bool], + new_n_past: "_Pointer[llama_cpp.llama_pos]", + /, +) -> int: + ... diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py index c6716f919..11bd363b5 100644 --- a/llama_cpp/server/model.py +++ b/llama_cpp/server/model.py @@ -171,6 +171,20 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama: chat_handler = llama_cpp.llama_chat_format.MiniCPMv26ChatHandler( clip_model_path=settings.clip_model_path, verbose=settings.verbose ) + elif settings.chat_format == "qwen2.5-vl": + assert settings.clip_model_path is not None, "clip model not found" + if settings.hf_model_repo_id is not None: + chat_handler = ( + llama_cpp.llama_chat_format.Qwen25VLChatHandler.from_pretrained( + repo_id=settings.hf_model_repo_id, + filename=settings.clip_model_path, + verbose=settings.verbose, + ) + ) + else: + chat_handler = llama_cpp.llama_chat_format.Qwen25VLChatHandler( + clip_model_path=settings.clip_model_path, verbose=settings.verbose + ) elif settings.chat_format == "hf-autotokenizer": assert ( settings.hf_pretrained_model_name_or_path is not None diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 794fe23f2..8846aace4 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 794fe23f29fb40104975c91fe19f23798f7c726e +Subproject commit 8846aace4934ad29651ea61b8c7e3f6b0556e3d2
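
Beyond the server wiring above, the new Qwen2.5-VL handler can also be used in-process. The sketch below follows the existing multimodal chat-handler pattern in llama-cpp-python; the model path, mmproj path, image URL, and n_ctx value are placeholders rather than anything prescribed by this patch.

from llama_cpp import Llama
from llama_cpp.llama_chat_format import Qwen25VLChatHandler

# Placeholder paths: a Qwen2.5-VL text model plus its multimodal projector (mmproj).
chat_handler = Qwen25VLChatHandler(clip_model_path="path/to/mmproj-qwen2.5-vl.gguf")
llm = Llama(
    model_path="path/to/qwen2.5-vl-3b-instruct.gguf",
    chat_handler=chat_handler,
    n_ctx=4096,  # leave room for the image embeddings in addition to the text prompt
)

response = llm.create_chat_completion(
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
                {"type": "text", "text": "Describe this image in one sentence."},
            ],
        }
    ]
)
print(response["choices"][0]["message"]["content"])

The equivalent server-side configuration is the branch added to llama_cpp/server/model.py above, selected with --chat_format qwen2.5-vl and --clip_model_path pointing at the mmproj file.
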