diff --git a/.github/workflows/build-and-release.yaml b/.github/workflows/build-and-release.yaml index 7307c85ab..7eaf017fb 100644 --- a/.github/workflows/build-and-release.yaml +++ b/.github/workflows/build-and-release.yaml @@ -11,7 +11,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - os: [ubuntu-20.04, windows-2019, macos-13] + os: [ubuntu-22.04, windows-2022, macos-14, macos-15] steps: - uses: actions/checkout@v4 @@ -74,6 +74,7 @@ jobs: CIBW_SKIP: "*musllinux* pp*" CIBW_REPAIR_WHEEL_COMMAND: "" CIBW_ARCHS: "aarch64" + CIBW_ENVIRONMENT: CMAKE_ARGS="-DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_APPLE_SILICON_PROCESSOR=arm64 -DCMAKE_CROSSCOMPILING=ON" CIBW_BUILD: "cp38-* cp39-* cp310-* cp311-* cp312-*" with: output-dir: wheelhouse diff --git a/.github/workflows/build-docker.yaml b/.github/workflows/build-docker.yaml index b5c7346db..b290f6273 100644 --- a/.github/workflows/build-docker.yaml +++ b/.github/workflows/build-docker.yaml @@ -9,7 +9,7 @@ permissions: jobs: docker: name: Build and push Docker image - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - name: Checkout uses: actions/checkout@v4 diff --git a/.github/workflows/build-wheels-cuda.yaml b/.github/workflows/build-wheels-cuda.yaml index 3d410148f..07b30cfc0 100644 --- a/.github/workflows/build-wheels-cuda.yaml +++ b/.github/workflows/build-wheels-cuda.yaml @@ -8,7 +8,7 @@ permissions: jobs: define_matrix: name: Define Build Matrix - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 outputs: matrix: ${{ steps.set-matrix.outputs.matrix }} defaults: @@ -20,7 +20,7 @@ jobs: id: set-matrix run: | $matrix = @{ - 'os' = @('ubuntu-latest', 'windows-2019') + 'os' = @('ubuntu-22.04') #, 'windows-2022') 'pyver' = @("3.9", "3.10", "3.11", "3.12") 'cuda' = @("12.1.1", "12.2.2", "12.3.2", "12.4.1") #, "12.5.1", "12.6.1") 'releasetag' = @("basic") @@ -61,11 +61,9 @@ jobs: - name: Setup Mamba uses: conda-incubator/setup-miniconda@v3.1.0 with: - activate-environment: "build" + activate-environment: "llamacpp" python-version: ${{ matrix.pyver }} - miniforge-variant: Mambaforge miniforge-version: latest - use-mamba: true add-pip-as-python-dependency: true auto-activate-base: false diff --git a/.github/workflows/build-wheels-metal.yaml b/.github/workflows/build-wheels-metal.yaml index 9b97bf2f5..98f511e4a 100644 --- a/.github/workflows/build-wheels-metal.yaml +++ b/.github/workflows/build-wheels-metal.yaml @@ -11,7 +11,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - os: [macos-13, macos-14, macos-15] + os: [macos-14, macos-15] steps: - uses: actions/checkout@v4 @@ -23,32 +23,21 @@ jobs: with: python-version: "3.12" cache: 'pip' - + - name: Install dependencies (Linux/MacOS) - if: runner.os != 'Windows' run: | python -m pip install --upgrade pip python -m pip install uv RUST_LOG=trace python -m uv pip install -e .[all] --verbose shell: bash - - name: Install dependencies (Windows) - if: runner.os == 'Windows' - env: - RUST_LOG: trace - run: | - python -m pip install --upgrade pip - python -m pip install uv - python -m uv pip install -e .[all] --verbose - shell: cmd - - name: Build wheels uses: pypa/cibuildwheel@v2.22.0 env: # disable repair CIBW_REPAIR_WHEEL_COMMAND: "" CIBW_ARCHS: "arm64" - CIBW_ENVIRONMENT: CMAKE_ARGS="-DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_APPLE_SILICON_PROCESSOR=arm64 -DGGML_METAL=on" + CIBW_ENVIRONMENT: CMAKE_ARGS="-DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_APPLE_SILICON_PROCESSOR=arm64 -DGGML_METAL=on -DCMAKE_CROSSCOMPILING=ON" CIBW_BUILD: "cp39-* cp310-* cp311-* cp312-*" with: package-dir: . 
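Note (illustrative): the CIBW_ENVIRONMENT lines above only export CMAKE_ARGS into the cibuildwheel build. The same configuration can be reproduced locally with a regular editable install. A minimal sketch, assuming it is run from the repository root; the flag values simply mirror the Metal workflow above and this is not a supported build script:

import os
import subprocess
import sys

env = dict(os.environ)
# Same CMAKE_ARGS the workflow passes through CIBW_ENVIRONMENT.
env["CMAKE_ARGS"] = (
    "-DCMAKE_OSX_ARCHITECTURES=arm64 "
    "-DCMAKE_APPLE_SILICON_PROCESSOR=arm64 "
    "-DGGML_METAL=on"
)

# Editable install, matching the "Install dependencies" step (uv is optional).
subprocess.check_call(
    [sys.executable, "-m", "pip", "install", "-e", ".[all]", "--verbose"],
    env=env,
)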
@@ -69,7 +58,7 @@ jobs: with: merge-multiple: true path: dist2 - + - uses: softprops/action-gh-release@v2 with: files: dist2/* diff --git a/CHANGELOG.md b/CHANGELOG.md index b4f49c904..6017812bb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,33 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.3.12] + +- feat: Update llama.cpp to ggerganov/llama.cpp@a0374a67e2924f2e845cdc59dd67d9a44065a89c + +## [0.3.11] + +- fix: Update reference to `llama_kv_cache_clear` in Llama.embed. Closes #2037 by @abetlen in 9e5a4eaa84156084ed7bbb91e6efcc91dc6217bc + +## [0.3.10] + +- feat: Update llama.cpp to ggerganov/llama.cpp@8846aace4934ad29651ea61b8c7e3f6b0556e3d2 +- feat: Add support for llama.cpp multimodal, add Qwen2.5-VL chat handler by @abetlen in cd548bd0f14210627798237d5c2ea78acfb88ccb + +## [0.3.9] + +- feat: Update llama.cpp to ggerganov/llama.cpp@8733e0cf6eefc7c7752297cc22d0836706f4222c + +## [0.3.8] + +- feat: Update llama.cpp to ggerganov/llama.cpp@7841fc723e059d1fd9640e5c0ef19050fcc7c698 + +## [0.3.7] + +- feat: Update llama.cpp to ggerganov/llama.cpp@794fe23f29fb40104975c91fe19f23798f7c726e +- fix(ci): Fix the CUDA workflow by @oobabooga in #1894 +- fix: error showing time spent in llama perf context print, adds `no_perf` flag to `Llama` class by @shakalaca in #1898 + ## [0.3.6] - feat: Update llama.cpp to ggerganov/llama.cpp@f7cd13301c2a88f97073fd119072b4cc92c08df1 diff --git a/CMakeLists.txt b/CMakeLists.txt index 64a0304a1..4b06d98b3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -62,6 +62,9 @@ if (LLAMA_BUILD) # Enable building of the common library set(LLAMA_BUILD_COMMON ON CACHE BOOL "Build llama.cpp common library" FORCE) + # Disable building curl support + set(LLAMA_CURL OFF CACHE BOOL "llama.cpp: enable curl" FORCE) + # Architecture detection and settings for Apple platforms if (APPLE) # Get the target architecture @@ -93,7 +96,15 @@ if (LLAMA_BUILD) set(GGML_METAL_EMBED_LIBRARY "ON" CACHE BOOL "ggml: embed metal library" FORCE) endif() + add_subdirectory(vendor/llama.cpp) + + if (WIN32) + if (TARGET llama) + set_target_properties(llama PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON) + endif() + endif() + llama_cpp_python_install_target(llama) llama_cpp_python_install_target(ggml) @@ -143,35 +154,34 @@ if (LLAMA_BUILD) endif() # Building llava - add_subdirectory(vendor/llama.cpp/examples/llava) - set_target_properties(llava_shared PROPERTIES OUTPUT_NAME "llava") + add_subdirectory(vendor/llama.cpp/tools/mtmd) if (WIN32) - set_target_properties(llava_shared PROPERTIES CUDA_ARCHITECTURES OFF) + set_target_properties(mtmd PROPERTIES CUDA_ARCHITECTURES OFF) endif() - llama_cpp_python_install_target(llava_shared) + llama_cpp_python_install_target(mtmd) if (WIN32) install( - FILES $ + FILES $ DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib ) install( - FILES $ + FILES $ DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp/lib ) endif() - # Fix for llava build: Add include directory for llama.h + # Fix for mtmd build: Add include directory for llama.h # Move these commands after the add_subdirectory call - target_include_directories(llava PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/include) - target_include_directories(llava PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/ggml/include) + target_include_directories(mtmd PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/include) + target_include_directories(mtmd PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/ggml/include) if (BUILD_SHARED_LIBS) - 
target_include_directories(llava_shared PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/include) - target_include_directories(llava_shared PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/ggml/include) + target_include_directories(mtmd PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/include) + target_include_directories(mtmd PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/ggml/include) endif() - target_include_directories(llama-llava-cli PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/include) - target_include_directories(llama-minicpmv-cli PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/include) + # target_include_directories(llama-llava-cli PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/include) + # target_include_directories(llama-minicpmv-cli PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/include) endif() endif() diff --git a/README.md b/README.md index e00456580..088a23779 100644 --- a/README.md +++ b/README.md @@ -505,6 +505,7 @@ Below are the supported multi-modal models and their respective chat handlers (P | [nanollava](https://huggingface.co/abetlen/nanollava-gguf) | `NanollavaChatHandler` | `nanollava` | | [llama-3-vision-alpha](https://huggingface.co/abetlen/llama-3-vision-alpha-gguf) | `Llama3VisionAlphaChatHandler` | `llama-3-vision-alpha` | | [minicpm-v-2.6](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) | `MiniCPMv26ChatHandler` | `minicpm-v-2.6` | +| [qwen2.5-vl](https://huggingface.co/unsloth/Qwen2.5-VL-3B-Instruct-GGUF) | `Qwen25VLChatHandler` | `qwen2.5-vl` | Then you'll need to use a custom chat handler to load the clip model and process the chat messages and images. diff --git a/docker/simple/Dockerfile b/docker/simple/Dockerfile index 3594df1a5..06483d44e 100644 --- a/docker/simple/Dockerfile +++ b/docker/simple/Dockerfile @@ -9,6 +9,7 @@ ARG IMAGE # Update and upgrade the existing packages RUN apt-get update && apt-get upgrade -y && apt-get install -y --no-install-recommends \ + git \ python3 \ python3-pip \ ninja-build \ diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index 8c6118fb4..b16bb7dc9 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.3.6" +__version__ = "0.3.12" diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index 8fa2b447f..18d733481 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -9,6 +9,8 @@ Tuple, Optional, Sequence, + Callable, + Union, ) from dataclasses import dataclass, field from contextlib import ExitStack @@ -48,43 +50,55 @@ def __init__( raise ValueError(f"Model path does not exist: {path_model}") with suppress_stdout_stderr(disable=verbose): - model = llama_cpp.llama_load_model_from_file( + model = llama_cpp.llama_model_load_from_file( self.path_model.encode("utf-8"), self.params ) if model is None: raise ValueError(f"Failed to load model from file: {path_model}") + vocab = llama_cpp.llama_model_get_vocab(model) + + if vocab is None: + raise ValueError(f"Failed to get vocab from model: {path_model}") + self.model = model + self.vocab = vocab + self.sampler = None # LlamaModel doesn't use samplers, but some cleanup code expects this attribute def free_model(): if self.model is None: return - llama_cpp.llama_free_model(self.model) + llama_cpp.llama_model_free(self.model) self.model = None self._exit_stack.callback(free_model) def close(self): + if self.sampler is not None: + # NOTE: Must remove custom samplers before free or llama.cpp will try to free them + for i, _ 
in reversed(self.custom_samplers): + llama_cpp.llama_sampler_chain_remove(self.sampler, i) + self.custom_samplers.clear() self._exit_stack.close() def __del__(self): self.close() def vocab_type(self) -> int: - return llama_cpp.llama_vocab_type(self.model) + return llama_cpp.llama_vocab_type(self.vocab) def n_vocab(self) -> int: - return llama_cpp.llama_n_vocab(self.model) + return llama_cpp.llama_vocab_n_tokens(self.vocab) def n_ctx_train(self) -> int: - return llama_cpp.llama_n_ctx_train(self.model) + return llama_cpp.llama_model_n_ctx_train(self.model) def n_embd(self) -> int: - return llama_cpp.llama_n_embd(self.model) + return llama_cpp.llama_model_n_embd(self.model) def rope_freq_scale_train(self) -> float: - return llama_cpp.llama_rope_freq_scale_train(self.model) + return llama_cpp.llama_model_rope_freq_scale_train(self.model) def desc(self) -> str: buf = ctypes.create_string_buffer(1024) @@ -98,53 +112,53 @@ def n_params(self) -> int: return llama_cpp.llama_model_n_params(self.model) def get_tensor(self, name: str) -> ctypes.c_void_p: - return llama_cpp.llama_get_model_tensor(self.model, name.encode("utf-8")) + raise NotImplementedError("get_tensor is not implemented in llama.cpp") # Vocab def token_get_text(self, token: int) -> str: - return llama_cpp.llama_token_get_text(self.model, token).decode("utf-8") + return llama_cpp.llama_vocab_get_text(self.vocab, token).decode("utf-8") def token_get_score(self, token: int) -> float: - return llama_cpp.llama_token_get_score(self.model, token) + return llama_cpp.llama_vocab_get_score(self.vocab, token) def token_get_attr(self, token: int) -> int: - return llama_cpp.llama_token_get_attr(self.model, token) + return llama_cpp.llama_vocab_get_attr(self.vocab, token) # Special tokens def token_bos(self) -> int: - return llama_cpp.llama_token_bos(self.model) + return llama_cpp.llama_vocab_bos(self.vocab) def token_eos(self) -> int: - return llama_cpp.llama_token_eos(self.model) + return llama_cpp.llama_vocab_eos(self.vocab) def token_cls(self) -> int: - return llama_cpp.llama_token_cls(self.model) + return llama_cpp.llama_vocab_cls(self.vocab) def token_sep(self) -> int: - return llama_cpp.llama_token_sep(self.model) + return llama_cpp.llama_vocab_sep(self.vocab) def token_nl(self) -> int: - return llama_cpp.llama_token_nl(self.model) + return llama_cpp.llama_vocab_nl(self.vocab) def token_prefix(self) -> int: - return llama_cpp.llama_token_prefix(self.model) + return llama_cpp.llama_vocab_fim_pre(self.vocab) def token_middle(self) -> int: - return llama_cpp.llama_token_middle(self.model) + return llama_cpp.llama_vocab_fim_mid(self.vocab) def token_suffix(self) -> int: - return llama_cpp.llama_token_suffix(self.model) + return llama_cpp.llama_vocab_fim_suf(self.vocab) def token_eot(self) -> int: - return llama_cpp.llama_token_eot(self.model) + return llama_cpp.llama_vocab_eot(self.vocab) def add_bos_token(self) -> bool: - return llama_cpp.llama_add_bos_token(self.model) + return llama_cpp.llama_vocab_get_add_bos(self.vocab) def add_eos_token(self) -> bool: - return llama_cpp.llama_add_eos_token(self.model) + return llama_cpp.llama_vocab_get_add_eos(self.vocab) # Tokenization @@ -152,13 +166,13 @@ def tokenize(self, text: bytes, add_bos: bool, special: bool): n_ctx = self.n_ctx_train() tokens = (llama_cpp.llama_token * n_ctx)() n_tokens = llama_cpp.llama_tokenize( - self.model, text, len(text), tokens, n_ctx, add_bos, special + self.vocab, text, len(text), tokens, n_ctx, add_bos, special ) if n_tokens < 0: n_tokens = abs(n_tokens) tokens = 
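Note (illustrative): the renames above move token/vocab queries from the model handle to a separate vocab handle. A minimal sketch of the new low-level call pattern, using only bindings referenced in this file; "model.gguf" is a placeholder path and error handling is intentionally minimal:

import llama_cpp

llama_cpp.llama_backend_init()

params = llama_cpp.llama_model_default_params()
model = llama_cpp.llama_model_load_from_file(b"model.gguf", params)  # placeholder path
if model is None:
    raise RuntimeError("failed to load model")

# Vocab queries now go through the vocab handle, not the model handle.
vocab = llama_cpp.llama_model_get_vocab(model)
print("n_tokens:", llama_cpp.llama_vocab_n_tokens(vocab))
print("bos:", llama_cpp.llama_vocab_bos(vocab))
print("eos:", llama_cpp.llama_vocab_eos(vocab))

llama_cpp.llama_model_free(model)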
(llama_cpp.llama_token * n_tokens)() n_tokens = llama_cpp.llama_tokenize( - self.model, text, len(text), tokens, n_tokens, add_bos, special + self.vocab, text, len(text), tokens, n_tokens, add_bos, special ) if n_tokens < 0: raise RuntimeError( @@ -168,7 +182,7 @@ def tokenize(self, text: bytes, add_bos: bool, special: bool): def token_to_piece(self, token: int, special: bool = False) -> bytes: buf = ctypes.create_string_buffer(32) - llama_cpp.llama_token_to_piece(self.model, token, buf, 32, 0, special) + llama_cpp.llama_token_to_piece(self.vocab, token, buf, 32, 0, special) return bytes(buf) def detokenize(self, tokens: List[int], special: bool = False) -> bytes: @@ -177,7 +191,7 @@ def detokenize(self, tokens: List[int], special: bool = False) -> bytes: buffer = (ctypes.c_char * size)() for token in tokens: n = llama_cpp.llama_token_to_piece( - self.model, llama_cpp.llama_token(token), buffer, size, 0, special + self.vocab, llama_cpp.llama_token(token), buffer, size, 0, special ) assert n <= size output += bytes(buffer[:n]) @@ -243,12 +257,14 @@ def __init__( self.verbose = verbose self._exit_stack = ExitStack() - ctx = llama_cpp.llama_new_context_with_model(self.model.model, self.params) + ctx = llama_cpp.llama_init_from_model(self.model.model, self.params) if ctx is None: raise ValueError("Failed to create llama_context") self.ctx = ctx + self.memory = llama_cpp.llama_get_memory(self.ctx) + self.sampler = None # LlamaContext doesn't manage samplers directly, but some cleanup code expects this attribute def free_ctx(): if self.ctx is None: @@ -271,22 +287,22 @@ def pooling_type(self) -> int: return llama_cpp.llama_pooling_type(self.ctx) def kv_cache_clear(self): - llama_cpp.llama_kv_cache_clear(self.ctx) + llama_cpp.llama_memory_clear(self.memory, True) def kv_cache_seq_rm(self, seq_id: int, p0: int, p1: int): - llama_cpp.llama_kv_cache_seq_rm(self.ctx, seq_id, p0, p1) + llama_cpp.llama_memory_seq_rm(self.memory, seq_id, p0, p1) def kv_cache_seq_cp(self, seq_id_src: int, seq_id_dst: int, p0: int, p1: int): - llama_cpp.llama_kv_cache_seq_cp(self.ctx, seq_id_src, seq_id_dst, p0, p1) + llama_cpp.llama_memory_seq_cp(self.memory, seq_id_src, seq_id_dst, p0, p1) def kv_cache_seq_keep(self, seq_id: int): - llama_cpp.llama_kv_cache_seq_keep(self.ctx, seq_id) + llama_cpp.llama_memory_seq_keep(self.memory, seq_id) def kv_cache_seq_shift(self, seq_id: int, p0: int, p1: int, shift: int): - llama_cpp.llama_kv_cache_seq_add(self.ctx, seq_id, p0, p1, shift) + llama_cpp.llama_memory_seq_add(self.memory, seq_id, p0, p1, shift) def get_state_size(self) -> int: - return llama_cpp.llama_get_state_size(self.ctx) + return llama_cpp.llama_state_get_size(self.ctx) # TODO: copy_state_data @@ -304,6 +320,14 @@ def decode(self, batch: LlamaBatch): if return_code != 0: raise RuntimeError(f"llama_decode returned {return_code}") + def encode(self, batch: LlamaBatch): + return_code = llama_cpp.llama_encode( + self.ctx, + batch.batch, + ) + if return_code != 0: + raise RuntimeError(f"llama_encode returned {return_code}") + def set_n_threads(self, n_threads: int, n_threads_batch: int): llama_cpp.llama_set_n_threads(self.ctx, n_threads, n_threads_batch) @@ -316,11 +340,16 @@ def get_logits_ith(self, i: int): def get_embeddings(self): return llama_cpp.llama_get_embeddings(self.ctx) - # Sampling functions + def get_embeddings_ith(self, i: int): + return llama_cpp.llama_get_embeddings_ith(self.ctx, i) + + def get_embeddings_seq(self, seq_id: int): + return llama_cpp.llama_get_embeddings_seq(self.ctx, seq_id) + + # Sampling 
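Note (illustrative): the kv_cache_* wrappers above now forward to the llama_memory_* API obtained via llama_get_memory. A minimal sketch of calling them through the internal _ctx handle, which is how llama.py itself uses them; these are internal APIs and may change, and "model.gguf" is a placeholder path:

from llama_cpp import Llama

llm = Llama(model_path="model.gguf", n_ctx=2048)  # placeholder path

llm._ctx.kv_cache_clear()           # forwards to llama_memory_clear(memory, True)
llm._ctx.kv_cache_seq_rm(0, 0, -1)  # forwards to llama_memory_seq_rm(memory, 0, 0, -1)
llm._ctx.kv_cache_seq_keep(0)       # forwards to llama_memory_seq_keep(memory, 0)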
functions - deprecated, use LlamaSampler instead def set_rng_seed(self, seed: int): - # TODO: Fix - llama_cpp.llama_set_rng_seed(self.ctx, seed) + raise NotImplementedError("set_rng_seed is deprecated, use LlamaSampler instead") def sample_repetition_penalties( self, @@ -331,55 +360,30 @@ def sample_repetition_penalties( penalty_freq: float, penalty_present: float, ): - llama_cpp.llama_sample_repetition_penalties( - self.ctx, - llama_cpp.byref(candidates.candidates), - last_tokens_data, - penalty_last_n, - penalty_repeat, - penalty_freq, - penalty_present, - ) + raise NotImplementedError("sample_repetition_penalties is deprecated, use LlamaSampler instead") def sample_softmax(self, candidates: "_LlamaTokenDataArray"): - llama_cpp.llama_sample_softmax( - self.ctx, - llama_cpp.byref(candidates.candidates), - ) + raise NotImplementedError("sample_softmax is deprecated, use LlamaSampler instead") def sample_top_k(self, candidates: "_LlamaTokenDataArray", k: int, min_keep: int): - llama_cpp.llama_sample_top_k( - self.ctx, llama_cpp.byref(candidates.candidates), k, min_keep - ) + raise NotImplementedError("sample_top_k is deprecated, use LlamaSampler instead") def sample_top_p(self, candidates: "_LlamaTokenDataArray", p: float, min_keep: int): - llama_cpp.llama_sample_top_p( - self.ctx, llama_cpp.byref(candidates.candidates), p, min_keep - ) + raise NotImplementedError("sample_top_p is deprecated, use LlamaSampler instead") def sample_min_p(self, candidates: "_LlamaTokenDataArray", p: float, min_keep: int): - llama_cpp.llama_sample_min_p( - self.ctx, llama_cpp.byref(candidates.candidates), p, min_keep - ) + raise NotImplementedError("sample_min_p is deprecated, use LlamaSampler instead") def sample_typical( self, candidates: "_LlamaTokenDataArray", p: float, min_keep: int ): - llama_cpp.llama_sample_typical( - self.ctx, llama_cpp.byref(candidates.candidates), p, min_keep - ) + raise NotImplementedError("sample_typical is deprecated, use LlamaSampler instead") def sample_temp(self, candidates: "_LlamaTokenDataArray", temp: float): - llama_cpp.llama_sample_temp( - self.ctx, llama_cpp.byref(candidates.candidates), temp - ) + raise NotImplementedError("sample_temp is deprecated, use LlamaSampler instead") def sample_grammar(self, candidates: "_LlamaTokenDataArray", grammar: LlamaGrammar): - llama_cpp.llama_sample_grammar( - self.ctx, - llama_cpp.byref(candidates.candidates), - grammar.grammar, - ) + raise NotImplementedError("sample_grammar is deprecated, use LlamaSampler instead") def sample_token_mirostat( self, @@ -389,14 +393,7 @@ def sample_token_mirostat( m: int, mu: llama_cpp.CtypesPointerOrRef[ctypes.c_float], ) -> int: - return llama_cpp.llama_sample_token_mirostat( - self.ctx, - llama_cpp.byref(candidates.candidates), - tau, - eta, - m, - mu, - ) + raise NotImplementedError("sample_token_mirostat is deprecated, use LlamaSampler instead") def sample_token_mirostat_v2( self, @@ -405,29 +402,17 @@ def sample_token_mirostat_v2( eta: float, mu: llama_cpp.CtypesPointerOrRef[ctypes.c_float], ) -> int: - return llama_cpp.llama_sample_token_mirostat_v2( - self.ctx, - llama_cpp.byref(candidates.candidates), - tau, - eta, - mu, - ) + raise NotImplementedError("sample_token_mirostat_v2 is deprecated, use LlamaSampler instead") def sample_token_greedy(self, candidates: "_LlamaTokenDataArray") -> int: - return llama_cpp.llama_sample_token_greedy( - self.ctx, - llama_cpp.byref(candidates.candidates), - ) + raise NotImplementedError("sample_token_greedy is deprecated, use LlamaSampler instead") def 
sample_token(self, candidates: "_LlamaTokenDataArray") -> int: - return llama_cpp.llama_sample_token( - self.ctx, - llama_cpp.byref(candidates.candidates), - ) + raise NotImplementedError("sample_token is deprecated, use LlamaSampler instead") # Grammar def grammar_accept_token(self, grammar: LlamaGrammar, token: int): - llama_cpp.llama_grammar_accept_token(grammar.grammar, self.ctx, token) + raise NotImplementedError("grammar_accept_token is deprecated, use LlamaSampler instead") def reset_timings(self): llama_cpp.llama_perf_context_reset(self.ctx) @@ -458,6 +443,7 @@ def __init__( raise ValueError("Failed to create llama_batch") self.batch = batch + self.sampler = None # LlamaBatch doesn't use samplers, but some cleanup code expects this attribute def free_batch(): if self.batch is None: @@ -520,6 +506,7 @@ def __init__(self, *, n_vocab: int): ) self.default_candidates_data_id = np.arange(self.n_vocab, dtype=np.intc) # type: ignore self.default_candidates_data_p = np.zeros(self.n_vocab, dtype=np.single) + self.sampler = None # LlamaTokenDataArray doesn't use samplers, but some cleanup code expects this attribute def copy_logits(self, logits: npt.NDArray[np.single]): self.candidates_data.id[:] = self.default_candidates_data_id @@ -608,103 +595,16 @@ def sample( idx: int = 0, logits_array: Optional[npt.NDArray[np.single]] = None, ): - n_vocab = ctx_main.model.n_vocab() - id: int = 0 - - if logits_array is None: - logits = ctx_main.get_logits_ith(idx) - logits_array = np.array( - ctypes.cast(logits, ctypes.POINTER(ctypes.c_float * n_vocab)).contents, - dtype=np.single, - ) - - # apply logit_bias - for token, logit_bias in self.params.logit_bias.items(): - logits_array[token] += logit_bias - - token_data_array = LlamaTokenDataArray( - n_vocab=n_vocab - ) # TODO: Only create this once - token_data_array.copy_logits(logits_array) - - # apply penalties - if len(self.prev) > 0: - nl_token = ctx_main.model.token_nl() - nl_logit = logits_array[nl_token] - last_tokens = self.prev[-self.params.penalty_last_n :] - last_tokens_size = min(len(last_tokens), self.params.penalty_last_n) - if last_tokens_size > 0: - last_tokens_p = (llama_cpp.llama_token * len(last_tokens))(*last_tokens) - ctx_main.sample_repetition_penalties( - token_data_array, - last_tokens_p, - last_tokens_size, - self.params.penalty_repeat, - self.params.penalty_freq, - self.params.penalty_present, - ) - if not self.params.penalize_nl: - token_data_array.candidates_data.logit[nl_token] = nl_logit - - if self.grammar is not None: - ctx_main.sample_grammar(token_data_array, self.grammar) - - if self.params.temp < 0: - ctx_main.sample_softmax(token_data_array) - id = token_data_array.candidates_data.id[0] - elif self.params.temp == 0: - id = ctx_main.sample_token_greedy(token_data_array) - else: - if self.params.mirostat == 1: - mirostat_m = 100 - ctx_main.sample_temp(token_data_array, self.params.temp) - id = ctx_main.sample_token_mirostat( - token_data_array, - self.params.mirostat_tau, - self.params.mirostat_eta, - mirostat_m, - ctypes.pointer(self.mirostat_mu), - ) - elif self.params.mirostat == 2: - ctx_main.sample_temp(token_data_array, self.params.temp) - id = ctx_main.sample_token_mirostat_v2( - token_data_array, - self.params.mirostat_tau, - self.params.mirostat_eta, - ctypes.pointer(self.mirostat_mu), - ) - else: - min_keep = max(1, self.params.n_probs) - ctx_main.sample_top_k( - token_data_array, self.params.top_k, min_keep=min_keep - ) - ctx_main.sample_typical( - token_data_array, self.params.typical_p, min_keep=min_keep - ) 
- ctx_main.sample_top_p( - token_data_array, self.params.top_p, min_keep=min_keep - ) - ctx_main.sample_min_p( - token_data_array, self.params.min_p, min_keep=min_keep - ) - ctx_main.sample_temp(token_data_array, self.params.temp) - id = ctx_main.sample_token(token_data_array) - return id + # This method is deprecated in favor of using LlamaSampler directly + raise NotImplementedError("LlamaSamplingContext.sample is deprecated, use LlamaSampler instead") def accept(self, ctx_main: LlamaContext, id: int, apply_grammar: bool): - if apply_grammar and self.grammar is not None: - ctx_main.grammar_accept_token(self.grammar, id) self.prev.append(id) -from typing import List, Callable, Optional, Union -import ctypes -import llama_cpp - - class CustomSampler: def __init__( - self, apply_func: typing.Callable[[llama_cpp.llama_token_data_array], None] + self, apply_func: Callable[[llama_cpp.llama_token_data_array], None] ): self.apply_func = apply_func @@ -737,72 +637,117 @@ def get_sampler(self) -> llama_cpp.llama_sampler_p: class LlamaSampler: def __init__(self): - params = llama_cpp.llama_sampler_chain_params() + params = llama_cpp.llama_sampler_chain_default_params() self.sampler = llama_cpp.llama_sampler_chain_init(params) - self.samplers: List[llama_cpp.llama_sampler_p] = [] self.custom_samplers: List[Tuple[int, CustomSampler]] = [] + self._exit_stack = ExitStack() + + def free_sampler(): + if self.sampler is not None: + # NOTE: Must remove custom samplers before free or llama.cpp will try to free them + for i, _ in reversed(self.custom_samplers): + llama_cpp.llama_sampler_chain_remove(self.sampler, i) + llama_cpp.llama_sampler_free(self.sampler) + self.sampler = None + + self._exit_stack.callback(free_sampler) + + def close(self): + self._exit_stack.close() + + def __del__(self): + self.close() def add_greedy(self): sampler = llama_cpp.llama_sampler_init_greedy() - self._add_sampler(sampler) + llama_cpp.llama_sampler_chain_add(self.sampler, sampler) def add_dist(self, seed: int): sampler = llama_cpp.llama_sampler_init_dist(seed) - self._add_sampler(sampler) + llama_cpp.llama_sampler_chain_add(self.sampler, sampler) def add_softmax(self): sampler = llama_cpp.llama_sampler_init_softmax() - self._add_sampler(sampler) + llama_cpp.llama_sampler_chain_add(self.sampler, sampler) def add_top_k(self, k: int): sampler = llama_cpp.llama_sampler_init_top_k(k) - self._add_sampler(sampler) + llama_cpp.llama_sampler_chain_add(self.sampler, sampler) - def add_top_p(self, p: float, min_keep: int): + def add_top_p(self, p: float, min_keep: int = 1): sampler = llama_cpp.llama_sampler_init_top_p(p, min_keep) - self._add_sampler(sampler) + llama_cpp.llama_sampler_chain_add(self.sampler, sampler) - def add_min_p(self, p: float, min_keep: int): + def add_min_p(self, p: float, min_keep: int = 1): sampler = llama_cpp.llama_sampler_init_min_p(p, min_keep) - self._add_sampler(sampler) + llama_cpp.llama_sampler_chain_add(self.sampler, sampler) - def add_typical(self, p: float, min_keep: int): + def add_typical(self, p: float, min_keep: int = 1): sampler = llama_cpp.llama_sampler_init_typical(p, min_keep) - self._add_sampler(sampler) + llama_cpp.llama_sampler_chain_add(self.sampler, sampler) def add_temp(self, temp: float): sampler = llama_cpp.llama_sampler_init_temp(temp) - self._add_sampler(sampler) + llama_cpp.llama_sampler_chain_add(self.sampler, sampler) def add_temp_ext(self, t: float, delta: float, exponent: float): sampler = llama_cpp.llama_sampler_init_temp_ext(t, delta, exponent) - self._add_sampler(sampler) + 
llama_cpp.llama_sampler_chain_add(self.sampler, sampler) + + def add_xtc(self, p: float, t: float, min_keep: int, seed: int): + sampler = llama_cpp.llama_sampler_init_xtc(p, t, min_keep, seed) + llama_cpp.llama_sampler_chain_add(self.sampler, sampler) + + def add_top_n_sigma(self, n: float): + sampler = llama_cpp.llama_sampler_init_top_n_sigma(n) + llama_cpp.llama_sampler_chain_add(self.sampler, sampler) def add_mirostat(self, n_vocab: int, seed: int, tau: float, eta: float, m: int): sampler = llama_cpp.llama_sampler_init_mirostat(n_vocab, seed, tau, eta, m) - self._add_sampler(sampler) + llama_cpp.llama_sampler_chain_add(self.sampler, sampler) def add_mirostat_v2(self, seed: int, tau: float, eta: float): sampler = llama_cpp.llama_sampler_init_mirostat_v2(seed, tau, eta) - self._add_sampler(sampler) + llama_cpp.llama_sampler_chain_add(self.sampler, sampler) def add_grammar(self, model: LlamaModel, grammar: LlamaGrammar): sampler = llama_cpp.llama_sampler_init_grammar( - model.model, grammar._grammar.encode("utf-8"), grammar._root.encode("utf-8") + model.vocab, grammar._grammar.encode("utf-8"), grammar._root.encode("utf-8") ) - self._add_sampler(sampler) + llama_cpp.llama_sampler_chain_add(self.sampler, sampler) + + def add_grammar_lazy_patterns( + self, + model: LlamaModel, + grammar: LlamaGrammar, + trigger_patterns: List[str], + trigger_tokens: List[int] + ): + # Convert patterns to C array + pattern_ptrs = (ctypes.c_char_p * len(trigger_patterns))() + for i, pattern in enumerate(trigger_patterns): + pattern_ptrs[i] = pattern.encode("utf-8") + + # Convert tokens to C array + token_array = (llama_cpp.llama_token * len(trigger_tokens))(*trigger_tokens) + + sampler = llama_cpp.llama_sampler_init_grammar_lazy_patterns( + model.vocab, + grammar._grammar.encode("utf-8"), + grammar._root.encode("utf-8"), + pattern_ptrs, + len(trigger_patterns), + token_array, + len(trigger_tokens) + ) + llama_cpp.llama_sampler_chain_add(self.sampler, sampler) def add_penalties( self, - n_vocab: int, - special_eos_id: int, - linefeed_id: int, penalty_last_n: int, penalty_repeat: float, penalty_freq: float, penalty_present: float, - penalize_nl: bool, - ignore_eos: bool, ): sampler = llama_cpp.llama_sampler_init_penalties( penalty_last_n, @@ -810,49 +755,96 @@ def add_penalties( penalty_freq, penalty_present, ) - self._add_sampler(sampler) + llama_cpp.llama_sampler_chain_add(self.sampler, sampler) + + def add_dry( + self, + model: LlamaModel, + n_ctx_train: int, + dry_multiplier: float, + dry_base: float, + dry_allowed_length: int, + dry_penalty_last_n: int, + seq_breakers: List[str] + ): + # Convert seq_breakers to C array + breaker_ptrs = (ctypes.c_char_p * len(seq_breakers))() + for i, breaker in enumerate(seq_breakers): + breaker_ptrs[i] = breaker.encode("utf-8") + + sampler = llama_cpp.llama_sampler_init_dry( + model.vocab, + n_ctx_train, + dry_multiplier, + dry_base, + dry_allowed_length, + dry_penalty_last_n, + breaker_ptrs, + len(seq_breakers) + ) + llama_cpp.llama_sampler_chain_add(self.sampler, sampler) - def init_logit_bias( - self, n_vocab: int, n_logit_bias, logit_bias: llama_cpp.llama_logit_bias_p + def add_logit_bias( + self, + n_vocab: int, + logit_bias: Dict[int, float] ): + # Convert logit_bias dict to C array + bias_array = (llama_cpp.llama_logit_bias * len(logit_bias))() + for i, (token, bias) in enumerate(logit_bias.items()): + bias_array[i].token = token + bias_array[i].bias = bias + sampler = llama_cpp.llama_sampler_init_logit_bias( - n_vocab, n_logit_bias, logit_bias + n_vocab, + 
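Note (illustrative): add_grammar now takes the vocab handle internally, but the high-level grammar API is unchanged. A minimal sketch of grammar-constrained generation, with a placeholder model path and a trivial GBNF grammar that forces a yes/no answer:

from llama_cpp import Llama
from llama_cpp.llama_grammar import LlamaGrammar

grammar = LlamaGrammar.from_string('root ::= "yes" | "no"')
llm = Llama(model_path="model.gguf")  # placeholder path

out = llm("Is water wet? Answer:", grammar=grammar, max_tokens=4)
print(out["choices"][0]["text"])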
len(logit_bias), + bias_array ) - self._add_sampler(sampler) + llama_cpp.llama_sampler_chain_add(self.sampler, sampler) + + def add_infill(self, model: LlamaModel): + sampler = llama_cpp.llama_sampler_init_infill(model.vocab) + llama_cpp.llama_sampler_chain_add(self.sampler, sampler) def add_custom( self, apply_func: Callable[[llama_cpp.llama_token_data_array], None] ): custom_sampler = CustomSampler(apply_func) sampler = custom_sampler.get_sampler() - self._add_sampler(sampler) + llama_cpp.llama_sampler_chain_add(self.sampler, sampler) # NOTE: Must remove custom samplers before free or llama.cpp will try to free them self.custom_samplers.append( (llama_cpp.llama_sampler_chain_n(self.sampler) - 1, custom_sampler) ) - def _add_sampler(self, sampler: llama_cpp.llama_sampler_p): - assert self.sampler is not None - llama_cpp.llama_sampler_chain_add(self.sampler, sampler) - self.samplers.append(sampler) - def get_seed(self) -> int: - assert self.sampler is not None return llama_cpp.llama_sampler_get_seed(self.sampler) - def sample(self, ctx: LlamaContext, idx: int) -> int: - assert self.sampler is not None + def sample(self, ctx: LlamaContext, idx: int = -1) -> int: return llama_cpp.llama_sampler_sample(self.sampler, ctx.ctx, idx) - def close(self): - if self.sampler: - # NOTE: Must remove custom samplers before free or llama.cpp will try to free them - for i, _ in reversed(self.custom_samplers): - llama_cpp.llama_sampler_chain_remove(self.sampler, i) - llama_cpp.llama_sampler_free(self.sampler) - self.sampler = None - self.samplers.clear() - self.custom_samplers.clear() + def accept(self, token: int): + llama_cpp.llama_sampler_accept(self.sampler, token) - def __del__(self): - self.close() + def reset(self): + llama_cpp.llama_sampler_reset(self.sampler) + + def clone(self): + # NOTE: Custom samplers cannot be cloned due to Python callback limitations + if self.custom_samplers: + raise NotImplementedError("Cannot clone LlamaSampler that contains custom samplers") + + cloned_sampler = llama_cpp.llama_sampler_clone(self.sampler) + # Create a new wrapper around the cloned sampler + new_sampler = LlamaSampler.__new__(LlamaSampler) + new_sampler.sampler = cloned_sampler + new_sampler.custom_samplers = [] + new_sampler._exit_stack = ExitStack() + + def free_sampler(): + if new_sampler.sampler is not None: + llama_cpp.llama_sampler_free(new_sampler.sampler) + new_sampler.sampler = None + + new_sampler._exit_stack.callback(free_sampler) + return new_sampler diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 2fd7ff193..2e93670e6 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -66,7 +66,6 @@ def __init__( split_mode: int = llama_cpp.LLAMA_SPLIT_MODE_LAYER, main_gpu: int = 0, tensor_split: Optional[List[float]] = None, - rpc_servers: Optional[str] = None, vocab_only: bool = False, use_mmap: bool = True, use_mlock: bool = False, @@ -93,7 +92,10 @@ def __init__( embedding: bool = False, offload_kqv: bool = True, flash_attn: bool = False, + op_offloat: Optional[bool] = None, + swa_full: Optional[bool] = None, # Sampling Params + no_perf: bool = False, last_n_tokens_size: int = 64, # LoRA Params lora_base: Optional[str] = None, @@ -149,7 +151,6 @@ def __init__( split_mode: How to split the model across GPUs. See llama_cpp.LLAMA_SPLIT_* for options. main_gpu: main_gpu interpretation depends on split_mode: LLAMA_SPLIT_MODE_NONE: the GPU that is used for the entire model. LLAMA_SPLIT_MODE_ROW: the GPU that is used for small tensors and intermediate results. 
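Note (illustrative): the per-context sampling methods removed above are replaced by a LlamaSampler chain. A minimal sketch that rebuilds the old default pipeline order (penalties, top_k, typical, top_p, min_p, temp, dist) with illustrative parameter values; LlamaSampler lives in the internal module and may change:

from llama_cpp._internals import LlamaSampler

sampler = LlamaSampler()
sampler.add_penalties(
    penalty_last_n=64,
    penalty_repeat=1.1,
    penalty_freq=0.0,
    penalty_present=0.0,
)
sampler.add_top_k(40)
sampler.add_typical(1.0)     # min_keep now defaults to 1
sampler.add_top_p(0.95)
sampler.add_min_p(0.05)
sampler.add_temp(0.8)
sampler.add_dist(seed=1234)  # the final sampler in the chain picks the token

# Given an internal LlamaContext `ctx` that has decoded a batch:
#   token = sampler.sample(ctx, -1)
#   sampler.accept(token)
sampler.close()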
LLAMA_SPLIT_MODE_LAYER: ignored tensor_split: How split tensors should be distributed across GPUs. If None, the model is not split. - rpc_servers: Comma separated list of RPC servers to use for offloading vocab_only: Only load the vocabulary no weights. use_mmap: Use mmap if possible. use_mlock: Force the system to keep the model in RAM. @@ -173,6 +174,9 @@ def __init__( embedding: Embedding mode only. offload_kqv: Offload K, Q, V to GPU. flash_attn: Use flash attention. + op_offloat: offload host tensor operations to device + swa_full: use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055) + no_perf: Measure performance timings. last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque. lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model. lora_path: Path to a LoRA file to apply to the model. @@ -224,11 +228,6 @@ def __init__( ) # 0x7FFFFFFF is INT32 max, will be auto set to all layers self.model_params.split_mode = split_mode self.model_params.main_gpu = main_gpu - if rpc_servers is not None: - self.model_params.rpc_servers = rpc_servers.encode("utf-8") - self._rpc_servers = rpc_servers - else: - self._rpc_servers = None self.tensor_split = tensor_split self._c_tensor_split = None if self.tensor_split is not None: @@ -339,18 +338,24 @@ def __init__( yarn_beta_slow if yarn_beta_slow != 0.0 else 0 ) self.context_params.yarn_orig_ctx = yarn_orig_ctx if yarn_orig_ctx != 0 else 0 - self.context_params.logits_all = ( - logits_all if draft_model is None else True - ) # Must be set to True for speculative decoding + self._logits_all = logits_all if draft_model is None else True self.context_params.embeddings = embedding # TODO: Rename to embeddings self.context_params.offload_kqv = offload_kqv self.context_params.flash_attn = flash_attn + + if op_offloat is not None: + self.context_params.op_offloat = op_offloat + + if swa_full is not None: + self.context_params.swa_full = swa_full + # KV cache quantization if type_k is not None: self.context_params.type_k = type_k if type_v is not None: self.context_params.type_v = type_v # Sampling Params + self.context_params.no_perf = no_perf self.last_n_tokens_size = last_n_tokens_size self.cache: Optional[BaseLlamaCache] = None @@ -406,10 +411,10 @@ def __init__( ) ) - self._lora_adapter: Optional[llama_cpp.llama_lora_adapter_p] = None + self._lora_adapter: Optional[llama_cpp.llama_adapter_lora_p] = None if self.lora_path: - self._lora_adapter = llama_cpp.llama_lora_adapter_init( + self._lora_adapter = llama_cpp.llama_adapter_lora_init( self._model.model, self.lora_path.encode("utf-8"), ) @@ -421,12 +426,12 @@ def __init__( def free_lora_adapter(): if self._lora_adapter is None: return - llama_cpp.llama_lora_adapter_free(self._lora_adapter) + llama_cpp.llama_adapter_lora_free(self._lora_adapter) self._lora_adapter = None self._stack.callback(free_lora_adapter) - if llama_cpp.llama_lora_adapter_set( + if llama_cpp.llama_set_adapter_lora( self._ctx.ctx, self._lora_adapter, self.lora_scale ): raise RuntimeError( @@ -565,7 +570,7 @@ def eval_tokens(self) -> Deque[int]: def eval_logits(self) -> Deque[List[float]]: return deque( self.scores[: self.n_tokens, :].tolist(), - maxlen=self._n_ctx if self.context_params.logits_all else 1, + maxlen=self._n_ctx if self._logits_all else 1, ) def tokenize( @@ -638,13 +643,13 @@ def eval(self, tokens: Sequence[int]): n_past = self.n_tokens n_tokens = len(batch) self._batch.set_batch( - 
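Note (illustrative): after this change rpc_servers is no longer accepted, and op_offloat / swa_full / no_perf are only written to the context params when explicitly set. A minimal sketch of the new constructor surface; "model.gguf" is a placeholder path and the values are illustrative:

from llama_cpp import Llama

llm = Llama(
    model_path="model.gguf",
    n_ctx=4096,
    n_gpu_layers=-1,   # offload all layers if a GPU backend is available
    flash_attn=True,
    swa_full=True,     # full-size SWA cache (see llama.cpp #13194)
    no_perf=False,     # keep performance counters enabled
)
out = llm("Q: What is the capital of France? A:", max_tokens=16)
print(out["choices"][0]["text"])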
batch=batch, n_past=n_past, logits_all=self.context_params.logits_all + batch=batch, n_past=n_past, logits_all=self._logits_all ) self._ctx.decode(self._batch) # Save tokens self.input_ids[n_past : n_past + n_tokens] = batch # Save logits - if self.context_params.logits_all: + if self._logits_all: rows = n_tokens cols = self._n_vocab logits = np.ctypeslib.as_array( @@ -706,15 +711,15 @@ def apply_func(token_data_array: llama_cpp.llama_token_data_array_p): sampler.add_custom(apply_func) sampler.add_penalties( - n_vocab=self._n_vocab, - special_eos_id=self._token_eos, - linefeed_id=self._token_nl, + # n_vocab=self._n_vocab, + # special_eos_id=self._token_eos, + # linefeed_id=self._token_nl, penalty_last_n=self.last_n_tokens_size, penalty_repeat=repeat_penalty, penalty_freq=frequency_penalty, penalty_present=presence_penalty, - penalize_nl=penalize_nl, - ignore_eos=False, + # penalize_nl=penalize_nl, + # ignore_eos=False, ) if grammar is not None: @@ -1036,7 +1041,7 @@ def embed( data: Union[List[List[float]], List[List[List[float]]]] = [] def decode_batch(seq_sizes: List[int]): - llama_cpp.llama_kv_cache_clear(self._ctx.ctx) + llama_cpp.llama_kv_self_clear(self._ctx.ctx) self._ctx.decode(self._batch) self._batch.reset() @@ -1107,7 +1112,7 @@ def decode_batch(seq_sizes: List[int]): output = data[0] if isinstance(input, str) else data - llama_cpp.llama_kv_cache_clear(self._ctx.ctx) + llama_cpp.llama_kv_self_clear(self._ctx.ctx) self.reset() if return_count: @@ -1152,9 +1157,9 @@ def _create_completion( bos_token_id: int = self.token_bos() cls_token_id: int = self._model.token_cls() sep_token_id: int = self._model.token_sep() - prefix_token_id: int = self._model.token_prefix() - middle_token_id: int = self._model.token_middle() - suffix_token_id: int = self._model.token_suffix() + prefix_token_id: int = 0 # self._model.token_prefix() # TODO: Fix + middle_token_id: int = 0 # self._model.token_middle() # TODO: Fix + suffix_token_id: int = 0 # self._model.token_suffix() # TODO: Fix add_space_prefix: bool = ( self.metadata.get("tokenizer.ggml.add_space_prefix", "true") == "true" ) @@ -1285,7 +1290,7 @@ def logit_bias_processor( else: stop_sequences = [] - if logprobs is not None and self.context_params.logits_all is False: + if logprobs is not None and self._logits_all is False: raise ValueError( "logprobs is not supported for models created with logits_all=False" ) @@ -1332,7 +1337,7 @@ def logit_bias_processor( logits_processor=logits_processor, grammar=grammar, ): - if llama_cpp.llama_token_is_eog(self._model.model, token): + if llama_cpp.llama_token_is_eog(self._model.vocab, token): text = self.detokenize(completion_tokens, prev_tokens=prompt_tokens) finish_reason = "stop" break @@ -2088,11 +2093,14 @@ def __getstate__(self): yarn_beta_fast=self.context_params.yarn_beta_fast, yarn_beta_slow=self.context_params.yarn_beta_slow, yarn_orig_ctx=self.context_params.yarn_orig_ctx, - logits_all=self.context_params.logits_all, + logits_all=self._logits_all, embedding=self.context_params.embeddings, offload_kqv=self.context_params.offload_kqv, flash_attn=self.context_params.flash_attn, + op_offloat=self.context_params.op_offloat, + swa_full=self.context_params.swa_full, # Sampling Params + no_perf=self.context_params.no_perf, last_n_tokens_size=self.last_n_tokens_size, # LoRA Params lora_base=self.lora_base, diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 17575c700..a288db7b0 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -28,6 +28,7 
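Note (illustrative): logits_all is now tracked as the internal _logits_all flag, but the user-facing rule is unchanged: requesting logprobs still requires the model to be created with logits_all=True. A minimal sketch with a placeholder model path:

from llama_cpp import Llama

llm = Llama(model_path="model.gguf", logits_all=True)  # placeholder path
out = llm.create_completion("The quick brown fox", max_tokens=8, logprobs=5)
print(out["choices"][0]["logprobs"]["top_logprobs"][0])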
@@ import numpy as np import numpy.typing as npt +import llama_cpp.llama_cpp as llama_cpp import llama_cpp.llama as llama import llama_cpp.llama_types as llama_types import llama_cpp.llama_grammar as llama_grammar @@ -2651,7 +2652,7 @@ def generate_streaming(tools, functions, function_call, prompt): class Llava15ChatHandler: DEFAULT_SYSTEM_MESSAGE: Optional[str] = ( - "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions." + "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions." ) CHAT_FORMAT = ( @@ -2690,70 +2691,72 @@ class Llava15ChatHandler: ) def __init__(self, clip_model_path: str, verbose: bool = True): - import llama_cpp.llava_cpp as llava_cpp + import llama_cpp.mtmd_cpp as mtmd_cpp self.clip_model_path = clip_model_path self.verbose = verbose - - self._llava_cpp = llava_cpp # TODO: Fix + self._mtmd_cpp = mtmd_cpp self._exit_stack = ExitStack() - self._last_image_embed: Optional[ - llava_cpp.CtypesPointer[llava_cpp.llava_image_embed] - ] = None - self._last_image_hash: Optional[int] = None + self.mtmd_ctx: Optional[mtmd_cpp.mtmd_context_p] = None if not os.path.exists(clip_model_path): raise ValueError(f"Clip model path does not exist: {clip_model_path}") + def _init_mtmd_context(self, llama_model: llama.Llama): + """Initialize mtmd context with the llama model.""" + if self.mtmd_ctx is not None: + return # Already initialized + with suppress_stdout_stderr(disable=self.verbose): - clip_ctx = self._llava_cpp.clip_model_load(self.clip_model_path.encode(), 0) + # Get default parameters + ctx_params = self._mtmd_cpp.mtmd_context_params_default() + ctx_params.use_gpu = True # TODO: Make this configurable + ctx_params.print_timings = self.verbose + ctx_params.n_threads = llama_model.n_threads + ctx_params.verbosity = 2 if self.verbose else 0 # GGML_LOG_LEVEL_INFO = 2 + + # Initialize mtmd context + self.mtmd_ctx = self._mtmd_cpp.mtmd_init_from_file( + self.clip_model_path.encode(), + llama_model.model, + ctx_params + ) - if clip_ctx is None: - raise ValueError(f"Failed to load clip model: {clip_model_path}") + if self.mtmd_ctx is None: + raise ValueError(f"Failed to load mtmd context from: {self.clip_model_path}") - self.clip_ctx = clip_ctx + # Check if vision is supported + if not self._mtmd_cpp.mtmd_support_vision(self.mtmd_ctx): + raise ValueError("Vision is not supported by this model") - def clip_free(): + def mtmd_free(): with suppress_stdout_stderr(disable=self.verbose): - self._llava_cpp.clip_free(self.clip_ctx) - - self._exit_stack.callback(clip_free) + if self.mtmd_ctx is not None: + self._mtmd_cpp.mtmd_free(self.mtmd_ctx) + self.mtmd_ctx = None - def last_image_embed_free(): - with suppress_stdout_stderr(disable=self.verbose): - if self._last_image_embed is not None: - self._llava_cpp.llava_image_embed_free(self._last_image_embed) - self._last_image_embed = None - - self._exit_stack.callback(last_image_embed_free) + self._exit_stack.callback(mtmd_free) def load_image(self, image_url: str) -> bytes: return self._load_image(image_url) - def _embed_image_bytes(self, image_bytes: bytes, n_threads_batch: int = 1): - if ( - self._last_image_embed is not None - and self._last_image_hash is not None - and hash(image_bytes) == self._last_image_hash - ): - return self._last_image_embed + def _create_bitmap_from_bytes(self, image_bytes: bytes): + """Create mtmd_bitmap from image 
bytes.""" + if self.mtmd_ctx is None: + raise ValueError("mtmd context not initialized") + with suppress_stdout_stderr(disable=self.verbose): - # Free the previous image embed - if self._last_image_embed is not None: - self._llava_cpp.llava_image_embed_free(self._last_image_embed) - self._last_image_embed = None - self._last_image_hash = None - embed = self._llava_cpp.llava_image_embed_make_with_bytes( - self.clip_ctx, - n_threads_batch, - (ctypes.c_uint8 * len(image_bytes)).from_buffer( - bytearray(image_bytes) - ), - len(image_bytes), + # Create bitmap from buffer using helper function + bitmap = self._mtmd_cpp.mtmd_helper_bitmap_init_from_buf( + self.mtmd_ctx, + (ctypes.c_uint8 * len(image_bytes)).from_buffer(bytearray(image_bytes)), + len(image_bytes) ) - self._last_image_embed = embed - self._last_image_hash = hash(image_bytes) - return embed + + if bitmap is None: + raise ValueError("Failed to create bitmap from image bytes") + + return bitmap def __call__( self, @@ -2794,7 +2797,9 @@ def __call__( llama_types.CreateChatCompletionResponse, Iterator[llama_types.CreateChatCompletionStreamResponse], ]: - assert self.clip_ctx is not None + # Initialize mtmd context + self._init_mtmd_context(llama) + assert self.mtmd_ctx is not None system_prompt = _get_system_message(messages) if system_prompt == "" and self.DEFAULT_SYSTEM_MESSAGE is not None: @@ -2809,54 +2814,131 @@ def __call__( trim_blocks=True, lstrip_blocks=True, ).from_string(self.CHAT_FORMAT) + + # Get the default media marker + media_marker = self._mtmd_cpp.mtmd_default_marker().decode('utf-8') + + # Replace image URLs with media markers in the template text = template.render( messages=messages, add_generation_prompt=True, eos_token=llama.detokenize([llama.token_eos()]), bos_token=llama.detokenize([llama.token_bos()]), ) - split_text = self.split_text_on_image_urls(text, image_urls) + + # Replace image URLs in text with media markers + for image_url in image_urls: + text = text.replace(image_url, media_marker) if self.verbose: print(text, file=sys.stderr) + # Create bitmaps from images + bitmaps = [] + bitmap_cleanup = [] + try: + for image_url in image_urls: + image_bytes = self.load_image(image_url) + bitmap = self._create_bitmap_from_bytes(image_bytes) + bitmaps.append(bitmap) + bitmap_cleanup.append(bitmap) + + # Create input text structure + input_text = self._mtmd_cpp.mtmd_input_text() + input_text.text = text.encode('utf-8') + input_text.add_special = True + input_text.parse_special = True + + # Create input chunks + chunks = self._mtmd_cpp.mtmd_input_chunks_init() + if chunks is None: + raise ValueError("Failed to create input chunks") - # Evaluate prompt - llama.reset() - llama._ctx.kv_cache_clear() - for type_, value in split_text: - if type_ == "text": - tokens = llama.tokenize( - value.encode("utf8"), add_bos=False, special=True + try: + # Tokenize text and images together + bitmap_array = (self._mtmd_cpp.mtmd_bitmap_p_ctypes * len(bitmaps))(*bitmaps) + result = self._mtmd_cpp.mtmd_tokenize( + self.mtmd_ctx, + chunks, + ctypes.byref(input_text), + bitmap_array, + len(bitmaps) ) - if llama.n_tokens + len(tokens) > llama.n_ctx(): - raise ValueError( - f"Prompt exceeds n_ctx: {llama.n_tokens + len(tokens)} > {llama.n_ctx()}" - ) - llama.eval(tokens) - else: - image_bytes = self.load_image(value) - embed = self._embed_image_bytes(image_bytes, llama.context_params.n_threads_batch) - if llama.n_tokens + embed.contents.n_image_pos > llama.n_ctx(): - raise ValueError( - f"Prompt exceeds n_ctx: {llama.n_tokens + 
embed.contents.n_image_pos} > {llama.n_ctx()}" - ) - n_past = ctypes.c_int(llama.n_tokens) - n_past_p = ctypes.pointer(n_past) - with suppress_stdout_stderr(disable=self.verbose): - self._llava_cpp.llava_eval_image_embed( - llama.ctx, - embed, - llama.n_batch, - n_past_p, - ) - # Required to avoid issues with hf tokenizer - llama.input_ids[llama.n_tokens : n_past.value] = -1 - llama.n_tokens = n_past.value - # Get prompt tokens to avoid a cache miss - prompt = llama.input_ids[: llama.n_tokens].tolist() + if result != 0: + raise ValueError(f"Failed to tokenize input: error code {result}") + + # Reset llama context + llama.reset() + llama._ctx.kv_cache_clear() + + # Process each chunk + n_past = llama_cpp.llama_pos(0) + n_chunks = self._mtmd_cpp.mtmd_input_chunks_size(chunks) + + for i in range(n_chunks): + chunk = self._mtmd_cpp.mtmd_input_chunks_get(chunks, i) + if chunk is None: + continue + + chunk_type = self._mtmd_cpp.mtmd_input_chunk_get_type(chunk) + + if chunk_type == self._mtmd_cpp.MTMD_INPUT_CHUNK_TYPE_TEXT: + # Handle text chunk + n_tokens_out = ctypes.c_size_t() + tokens_ptr = self._mtmd_cpp.mtmd_input_chunk_get_tokens_text( + chunk, ctypes.byref(n_tokens_out) + ) + + if tokens_ptr and n_tokens_out.value > 0: + # Convert ctypes array to Python list + tokens = [tokens_ptr[j] for j in range(n_tokens_out.value)] + + if llama.n_tokens + len(tokens) > llama.n_ctx(): + raise ValueError( + f"Prompt exceeds n_ctx: {llama.n_tokens + len(tokens)} > {llama.n_ctx()}" + ) + llama.eval(tokens) + + elif chunk_type in [self._mtmd_cpp.MTMD_INPUT_CHUNK_TYPE_IMAGE, self._mtmd_cpp.MTMD_INPUT_CHUNK_TYPE_AUDIO]: + # Handle image/audio chunk using helper + chunk_n_tokens = self._mtmd_cpp.mtmd_input_chunk_get_n_tokens(chunk) + + if llama.n_tokens + chunk_n_tokens > llama.n_ctx(): + raise ValueError( + f"Prompt exceeds n_ctx: {llama.n_tokens + chunk_n_tokens} > {llama.n_ctx()}" + ) + + new_n_past = llama_cpp.llama_pos(0) + result = self._mtmd_cpp.mtmd_helper_eval_chunk_single( + self.mtmd_ctx, + llama._ctx.ctx, + chunk, + llama_cpp.llama_pos(llama.n_tokens), + llama_cpp.llama_seq_id(0), + llama.n_batch, + False, # logits_last + ctypes.byref(new_n_past) + ) + + if result != 0: + raise ValueError(f"Failed to evaluate chunk: error code {result}") + + # Update llama's token count + llama.n_tokens = new_n_past.value + + # Get prompt tokens to avoid a cache miss + prompt = llama.input_ids[: llama.n_tokens].tolist() + finally: + self._mtmd_cpp.mtmd_input_chunks_free(chunks) + + finally: + # Cleanup bitmaps + for bitmap in bitmap_cleanup: + self._mtmd_cpp.mtmd_bitmap_free(bitmap) + + # Handle response format and tools (same as before) if response_format is not None and response_format["type"] == "json_object": grammar = _grammar_for_response_format(response_format) @@ -2931,6 +3013,7 @@ def __call__( grammar=grammar, logit_bias=logit_bias, ) + if tool is not None: tool_name = tool["function"]["name"] return _convert_completion_to_chat_function( @@ -2943,12 +3026,10 @@ def _load_image(image_url: str) -> bytes: # TODO: Add Pillow support for other image formats beyond (jpg, png) if image_url.startswith("data:"): import base64 - image_bytes = base64.b64decode(image_url.split(",")[1]) return image_bytes else: import urllib.request - with urllib.request.urlopen(image_url) as f: image_bytes = f.read() return image_bytes @@ -2974,6 +3055,7 @@ def get_image_urls(messages: List[llama_types.ChatCompletionRequestMessage]): @staticmethod def split_text_on_image_urls(text: str, image_urls: List[str]): + """This method is no 
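Note (illustrative): the reworked handler initializes its mtmd context lazily on the first call, so end-user code looks the same as before. A minimal sketch using the Qwen25VLChatHandler added further below; the model, mmproj, and image URL are placeholders:

from llama_cpp import Llama
from llama_cpp.llama_chat_format import Qwen25VLChatHandler

chat_handler = Qwen25VLChatHandler(clip_model_path="mmproj.gguf", verbose=False)
llm = Llama(
    model_path="qwen2.5-vl.gguf",
    chat_handler=chat_handler,
    n_ctx=8192,  # leave room for the image tokens produced by mtmd
)

resp = llm.create_chat_completion(
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
                {"type": "text", "text": "Describe this image."},
            ],
        }
    ],
)
print(resp["choices"][0]["message"]["content"])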
longer used in the new implementation.""" def find_first(s: str, substrs: List[str]): for i, substr in enumerate(substrs): pos = s.find(substr) @@ -3373,6 +3455,61 @@ class MiniCPMv26ChatHandler(Llava15ChatHandler): ) +class Qwen25VLChatHandler(Llava15ChatHandler): + DEFAULT_SYSTEM_MESSAGE = "You are a helpful assistant." + + CHAT_FORMAT = ( + "<|im_start|>system\n" + "You are a helpful assistant.<|im_end|>\n" + "{% for message in messages %}" + "{% if message['role'] == 'user' %}" + "<|im_start|>user\n" + "{% if message['content'] is string %}" + "{{ message['content'] }}" + "{% else %}" + "{% for content in message['content'] %}" + "{% if content['type'] == 'text' %}" + "{{ content['text'] }}" + "{% elif content['type'] == 'image_url' %}" + "{% if content.image_url is string %}" + "{{ content.image_url }}" + "{% else %}" + "{{ content.image_url.url }}" + "{% endif %}" + "{% endif %}" + "{% endfor %}" + "{% endif %}" + "<|im_end|>\n" + "{% endif %}" + "{% endfor %}" + "<|im_start|>assistant\n" + ) + + def __call__(self, **kwargs): + llama = kwargs['llama'] + + # Clear state for multiple runs + llama.reset() + llama._ctx.kv_cache_clear() + llama.n_tokens = 0 + + if hasattr(llama, 'input_ids'): + llama.input_ids.fill(0) + + # Clear any handler state + if hasattr(self, '_last_image_embed'): + self._last_image_embed = None + self._last_image_hash = None + + if self.verbose: + messages = kwargs.get('messages', []) + image_count = len(self.get_image_urls(messages)) + print(f"Minimal - Cleared state, processing {image_count} images", file=sys.stderr) + + # Use parent implementation + return super().__call__(**kwargs) + + @register_chat_completion_handler("chatml-function-calling") def chatml_function_calling( llama: llama.Llama, diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 16c6b9d73..d13d60458 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -149,6 +149,10 @@ # define LLAMA_STATE_SEQ_VERSION 2 LLAMA_STATE_SEQ_VERSION = 2 +# struct llama_vocab; +llama_vocab_p = NewType("llama_vocab_p", int) +llama_vocab_p_ctypes = ctypes.c_void_p + # struct llama_model; llama_model_p = NewType("llama_model_p", int) llama_model_p_ctypes = ctypes.c_void_p @@ -157,9 +161,13 @@ llama_context_p = NewType("llama_context_p", int) llama_context_p_ctypes = ctypes.c_void_p -# # struct llama_sampler; -# llama_sampler_p = NewType("llama_sampler_p", int) -# llama_sampler_p_ctypes = ctypes.c_void_p +# typedef struct llama_memory_i * llama_memory_t; +llama_memory_t = NewType("llama_memory_t", int) +llama_memory_t_ctypes = ctypes.c_void_p + +# struct llama_kv_cache; (DEPRECATED) +llama_kv_cache_p = NewType("llama_kv_cache_p", int) +llama_kv_cache_p_ctypes = ctypes.c_void_p # typedef int32_t llama_pos; llama_pos = ctypes.c_int32 @@ -223,6 +231,13 @@ # LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26, # LLAMA_VOCAB_PRE_TYPE_MINERVA = 27, # LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28, +# LLAMA_VOCAB_PRE_TYPE_GPT4O = 29, +# LLAMA_VOCAB_PRE_TYPE_SUPERBPE = 30, +# LLAMA_VOCAB_PRE_TYPE_TRILLION = 31, +# LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32, +# LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33, +# LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34, +# LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35, # }; LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0 LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1 @@ -240,7 +255,7 @@ LLAMA_VOCAB_PRE_TYPE_DBRX = 13 LLAMA_VOCAB_PRE_TYPE_SMAUG = 14 LLAMA_VOCAB_PRE_TYPE_PORO = 15 -LLAMA_VOCAV_PRE_TYPE_CHATGLM3 = 16 +LLAMA_VOCAB_PRE_TYPE_CHATGLM3 = 16 LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17 LLAMA_VOCAB_PRE_TYPE_VIKING = 18 LLAMA_VOCAB_PRE_TYPE_JAIS = 19 @@ -253,6 
+268,13 @@ LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26 LLAMA_VOCAB_PRE_TYPE_MINERVA = 27 LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28 +LLAMA_VOCAB_PRE_TYPE_GPT4O = 29 +LLAMA_VOCAB_PRE_TYPE_SUPERBPE = 30 +LLAMA_VOCAB_PRE_TYPE_TRILLION = 31 +LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32 +LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33 +LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34 +LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35 # // note: these values should be synchronized with ggml_rope @@ -401,14 +423,14 @@ # LLAMA_ROPE_SCALING_TYPE_LINEAR = 1, # LLAMA_ROPE_SCALING_TYPE_YARN = 2, # LLAMA_ROPE_SCALING_TYPE_LONGROPE = 3, -# LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_YARN, +# LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_LONGROPE, # }; LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED = -1 LLAMA_ROPE_SCALING_TYPE_NONE = 0 LLAMA_ROPE_SCALING_TYPE_LINEAR = 1 LLAMA_ROPE_SCALING_TYPE_YARN = 2 LLAMA_ROPE_SCALING_TYPE_LONGROPE = 3 -LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_YARN +LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_LONGROPE # enum llama_pooling_type { # LLAMA_POOLING_TYPE_UNSPECIFIED = -1, @@ -438,7 +460,7 @@ # enum llama_split_mode { # LLAMA_SPLIT_MODE_NONE = 0, // single GPU # LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs -# LLAMA_SPLIT_MODE_ROW = 2, // split rows across GPUs +# LLAMA_SPLIT_MODE_ROW = 2, // split layers and KV across GPUs, use tensor parallelism if supported # }; LLAMA_SPLIT_MODE_NONE = 0 LLAMA_SPLIT_MODE_LAYER = 1 @@ -512,18 +534,21 @@ class llama_token_data_array(ctypes.Structure): ) -# // Input data for llama_decode +# // Input data for llama_encode/llama_decode # // A llama_batch object can contain input about one or many sequences # // The provided arrays (i.e. token, embd, pos, etc.) must have size of n_tokens # // # // - token : the token ids of the input (used when embd is NULL) # // - embd : token embeddings (i.e. 
float vector of size n_embd) (used when token is NULL) # // - pos : the positions of the respective token in the sequence -# // (if set to NULL, the token position will be tracked automatically by llama_decode) +# // (if set to NULL, the token position will be tracked automatically by llama_encode/llama_decode) # // - seq_id : the sequence to which the respective token belongs # // (if set to NULL, the sequence ID will be assumed to be 0) # // - logits : if zero, the logits (and/or the embeddings) for the respective token will not be output -# // (if set to NULL, only the logits for last token will be returned) +# // (if set to NULL: +# // - if embeddings: all tokens are output +# // - if not: only the last token is output +# // ) # // # typedef struct llama_batch { # int32_t n_tokens; @@ -533,10 +558,10 @@ class llama_token_data_array(ctypes.Structure): # llama_pos * pos; # int32_t * n_seq_id; # llama_seq_id ** seq_id; -# int8_t * logits; // TODO: rename this to "output" +# int8_t * logits; // TODO: rename this to "output" # } llama_batch; class llama_batch(ctypes.Structure): - """Input data for llama_decode + """Input data for llama_encode/llama_decode A llama_batch object can contain input about one or many sequences @@ -624,25 +649,28 @@ class llama_model_kv_override(ctypes.Structure): value: Union[int, float, bool, bytes] +# struct llama_model_tensor_buft_override { +# const char * pattern; +# ggml_backend_buffer_type_t buft; +# }; + + # struct llama_model_params { # // NULL-terminated list of devices to use for offloading (if NULL, all available devices are used) # ggml_backend_dev_t * devices; +# // NULL-terminated list of buffer types to use for tensors that match a pattern +# const struct llama_model_tensor_buft_override * tensor_buft_overrides; + # int32_t n_gpu_layers; // number of layers to store in VRAM # enum llama_split_mode split_mode; // how to split the model across multiple GPUs -# // main_gpu interpretation depends on split_mode: -# // LLAMA_SPLIT_MODE_NONE: the GPU that is used for the entire model -# // LLAMA_SPLIT_MODE_ROW: the GPU that is used for small tensors and intermediate results -# // LLAMA_SPLIT_MODE_LAYER: ignored +# // the GPU that is used for the entire model when split_mode is LLAMA_SPLIT_MODE_NONE # int32_t main_gpu; # // proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices() # const float * tensor_split; -# // comma separated list of RPC servers to use for offloading -# const char * rpc_servers; - # // Called with a progress value between 0.0 and 1.0. Pass NULL to disable. # // If the provided progress_callback returns true, model loading continues. # // If it returns false, model loading is immediately aborted. @@ -654,7 +682,6 @@ class llama_model_kv_override(ctypes.Structure): # // override key-value pairs of the model meta data # const struct llama_model_kv_override * kv_overrides; - # // Keep the booleans together to avoid misalignment during copy-by-value. 
# bool vocab_only; // only load the vocabulary, no weights # bool use_mmap; // use mmap if possible @@ -665,11 +692,12 @@ class llama_model_params(ctypes.Structure): """Parameters for llama_model Attributes: + devices (ctypes.Array[ggml_backend_dev_t]): NULL-terminated list of devices to use for offloading (if NULL, all available devices are used) + tensor_buft_overrides (ctypes.Array[llama_model_tensor_buft_override]): NULL-terminated list of buffer types to use for tensors that match a pattern n_gpu_layers (int): number of layers to store in VRAM split_mode (int): how to split the model across multiple GPUs - main_gpu (int): the GPU that is used for the entire model. main_gpu interpretation depends on split_mode: LLAMA_SPLIT_NONE: the GPU that is used for the entire model LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results LLAMA_SPLIT_LAYER: ignored + main_gpu (int): the GPU that is used for the entire model when split_mode is LLAMA_SPLIT_MODE_NONE tensor_split (ctypes.Array[ctypes.ctypes.c_float]): proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices() - rpc_servers (ctypes.c_char_p): comma separated list of RPC servers to use for offloading progress_callback (llama_progress_callback): called with a progress value between 0.0 and 1.0. Pass NULL to disable. If the provided progress_callback returns true, model loading continues. If it returns false, model loading is immediately aborted. progress_callback_user_data (ctypes.ctypes.c_void_p): context pointer passed to the progress callback kv_overrides (ctypes.Array[llama_model_kv_override]): override key-value pairs of the model meta data @@ -679,11 +707,12 @@ class llama_model_params(ctypes.Structure): check_tensors (bool): validate model tensor data""" if TYPE_CHECKING: + devices: CtypesArray[ctypes.c_void_p] # NOTE: unused + tensor_buft_overrides: CtypesArray[llama_model_tensor_buft_override] # NOTE: unused n_gpu_layers: int split_mode: int main_gpu: int tensor_split: CtypesArray[ctypes.c_float] - rpc_servers: ctypes.c_char_p progress_callback: Callable[[float, ctypes.c_void_p], bool] progress_callback_user_data: ctypes.c_void_p kv_overrides: CtypesArray[llama_model_kv_override] @@ -694,11 +723,11 @@ class llama_model_params(ctypes.Structure): _fields_ = [ ("devices", ctypes.c_void_p), # NOTE: unnused + ("tensor_buft_overrides", ctypes.c_void_p), # NOTE: unused ("n_gpu_layers", ctypes.c_int32), ("split_mode", ctypes.c_int), ("main_gpu", ctypes.c_int32), ("tensor_split", ctypes.POINTER(ctypes.c_float)), - ("rpc_servers", ctypes.c_char_p), ("progress_callback", llama_progress_callback), ("progress_callback_user_data", ctypes.c_void_p), ("kv_overrides", ctypes.POINTER(llama_model_kv_override)), @@ -710,7 +739,7 @@ class llama_model_params(ctypes.Structure): # // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations -# // https://github.com/ggerganov/llama.cpp/pull/7544 +# // https://github.com/ggml-org/llama.cpp/pull/7544 # struct llama_context_params { # uint32_t n_ctx; // text context, 0 = from model # uint32_t n_batch; // logical maximum batch size that can be submitted to llama_decode @@ -723,7 +752,7 @@ class llama_model_params(ctypes.Structure): # enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id # enum llama_attention_type attention_type; // attention type to use for embeddings -# // ref: https://github.com/ggerganov/llama.cpp/pull/2054 +# 
// ref: https://github.com/ggml-org/llama.cpp/pull/2054 # float rope_freq_base; // RoPE base frequency, 0 = from model # float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model # float yarn_ext_factor; // YaRN extrapolation mix factor, negative = from model @@ -731,7 +760,7 @@ class llama_model_params(ctypes.Structure): # float yarn_beta_fast; // YaRN low correction dim # float yarn_beta_slow; // YaRN high correction dim # uint32_t yarn_orig_ctx; // YaRN original context size -# float defrag_thold; // defragment the KV cache if holes/size > thold, < 0 disabled (default) +# float defrag_thold; // defragment the KV cache if holes/size > thold, <= 0 disabled (default) # ggml_backend_sched_eval_callback cb_eval; # void * cb_eval_user_data; @@ -739,19 +768,21 @@ class llama_model_params(ctypes.Structure): # enum ggml_type type_k; // data type for K cache [EXPERIMENTAL] # enum ggml_type type_v; // data type for V cache [EXPERIMENTAL] -# // Keep the booleans together to avoid misalignment during copy-by-value. -# bool logits_all; // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead) -# bool embeddings; // if true, extract embeddings (together with logits) -# bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU -# bool flash_attn; // whether to use flash attention [EXPERIMENTAL] -# bool no_perf; // whether to measure performance timings - - # // Abort callback # // if it returns true, execution of llama_decode() will be aborted # // currently works only with CPU execution # ggml_abort_callback abort_callback; # void * abort_callback_data; + +# // Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value. +# bool embeddings; // if true, extract embeddings (together with logits) +# bool offload_kqv; // offload the KQV ops (including the KV cache) to GPU +# bool flash_attn; // use flash attention [EXPERIMENTAL] +# bool no_perf; // measure performance timings +# bool op_offload; // offload host tensor operations to device +# bool swa_full; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055) +# // NOTE: setting to false when n_seq_max > 1 can cause bad performance in some cases +# // ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573 # }; class llama_context_params(ctypes.Structure): """Parameters for llama_context @@ -773,17 +804,19 @@ class llama_context_params(ctypes.Structure): yarn_beta_fast (float): YaRN low correction dim yarn_beta_slow (float): YaRN high correction dim yarn_orig_ctx (int): YaRN original context size - defrag_thold (float): defragment the KV cache if holes/size > thold, < 0 disabled (default) + defrag_thold (float): defragment the KV cache if holes/size > thold, <= 0 disabled (default) cb_eval (ggml_backend_sched_eval_callback): callback for scheduling eval cb_eval_user_data (ctypes.ctypes.c_void_p): user data for cb_eval type_k (int): data type for K cache type_v (int): data type for V cache - logits_all (bool): the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead) + abort_callback (ggml_abort_callback): abort callback if it returns true, execution of llama_decode() will be aborted + abort_callback_data (ctypes.ctypes.c_void_p): data for abort_callback embeddings (bool): if true, extract embeddings (together with logits) offload_kqv (bool): whether to offload the KQV ops (including the KV cache) 
to GPU flash_attn (bool): whether to use flash attention - abort_callback (ggml_abort_callback): abort callback if it returns true, execution of llama_decode() will be aborted - abort_callback_data (ctypes.ctypes.c_void_p): data for abort_callback + no_perf (bool): whether to measure performance timings + op_offload (bool): offload host tensor operations to device + swa_full (bool): use full-size SWA cache """ if TYPE_CHECKING: @@ -808,12 +841,14 @@ class llama_context_params(ctypes.Structure): cb_eval_user_data: ctypes.c_void_p type_k: int type_v: int - logits_all: bool + abort_callback: Callable[[ctypes.c_void_p], bool] + abort_callback_data: ctypes.c_void_p embeddings: bool offload_kqv: bool flash_attn: bool - abort_callback: Callable[[ctypes.c_void_p], bool] - abort_callback_data: ctypes.c_void_p + no_perf: bool + op_offload: bool + swa_full: bool _fields_ = [ ("n_ctx", ctypes.c_uint32), @@ -837,12 +872,14 @@ class llama_context_params(ctypes.Structure): ("cb_eval_user_data", ctypes.c_void_p), ("type_k", ctypes.c_int), ("type_v", ctypes.c_int), - ("logits_all", ctypes.c_bool), + ("abort_callback", ggml_abort_callback), + ("abort_callback_data", ctypes.c_void_p), ("embeddings", ctypes.c_bool), ("offload_kqv", ctypes.c_bool), ("flash_attn", ctypes.c_bool), - ("abort_callback", ggml_abort_callback), - ("abort_callback_data", ctypes.c_void_p), + ("no_perf", ctypes.c_bool), + ("op_offload", ctypes.c_bool), + ("swa_full", ctypes.c_bool), ] @@ -864,17 +901,19 @@ class llama_context_params(ctypes.Structure): # // model quantization parameters # typedef struct llama_model_quantize_params { -# int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency() -# enum llama_ftype ftype; // quantize to this llama_ftype -# enum ggml_type output_tensor_type; // output tensor type -# enum ggml_type token_embedding_type; // token embeddings tensor type -# bool allow_requantize; // allow quantizing non-f32/f16 tensors -# bool quantize_output_tensor; // quantize output.weight -# bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored -# bool pure; // quantize all tensors to the default type -# bool keep_split; // quantize to the same number of shards -# void * imatrix; // pointer to importance matrix data -# void * kv_overrides; // pointer to vector containing overrides +# int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency() +# enum llama_ftype ftype; // quantize to this llama_ftype +# enum ggml_type output_tensor_type; // output tensor type +# enum ggml_type token_embedding_type; // token embeddings tensor type +# bool allow_requantize; // allow quantizing non-f32/f16 tensors +# bool quantize_output_tensor; // quantize output.weight +# bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored +# bool pure; // quantize all tensors to the default type +# bool keep_split; // quantize to the same number of shards +# void * imatrix; // pointer to importance matrix data +# void * kv_overrides; // pointer to vector containing overrides +# void * tensor_types; // pointer to vector containing tensor types +# void * prune_layers; // pointer to vector containing layer indices to prune # } llama_model_quantize_params; class llama_model_quantize_params(ctypes.Structure): """Parameters for llama_model_quantize @@ -891,6 +930,8 @@ class llama_model_quantize_params(ctypes.Structure): keep_split (bool): quantize to the same 
number of shards imatrix (ctypes.c_void_p): pointer to importance matrix data kv_overrides (ctypes.c_void_p): pointer to vector containing overrides + tensor_types (ctypes.c_void_p): pointer to vector containing tensor types + prune_layers (ctypes.c_void_p): pointer to vector containing layer indices to prune """ if TYPE_CHECKING: @@ -905,6 +946,8 @@ class llama_model_quantize_params(ctypes.Structure): keep_split: bool imatrix: ctypes.c_void_p kv_overrides: ctypes.c_void_p + tensor_types: ctypes.c_void_p + prune_layers: ctypes.c_void_p _fields_ = [ ("nthread", ctypes.c_int32), @@ -918,6 +961,8 @@ class llama_model_quantize_params(ctypes.Structure): ("keep_split", ctypes.c_bool), ("imatrix", ctypes.c_void_p), ("kv_overrides", ctypes.c_void_p), + ("tensor_types", ctypes.c_void_p), + ("prune_layers", ctypes.c_void_p), ] @@ -975,9 +1020,9 @@ class llama_chat_message(ctypes.Structure): # // lora adapter -# struct llama_lora_adapter; -llama_lora_adapter_p = ctypes.c_void_p -llama_lora_adapter_p_ctypes = ctypes.POINTER(ctypes.c_void_p) +# struct llama_adapter_lora; +llama_adapter_lora_p = ctypes.c_void_p +llama_adapter_lora_p_ctypes = ctypes.POINTER(ctypes.c_void_p) # // Helpers for getting default parameters @@ -1028,7 +1073,6 @@ def llama_model_quantize_default_params() -> llama_model_quantize_params: # // Initialize the llama + ggml backend # // If numa is true, use NUMA optimizations # // Call once at the start of the program -# LLAMA_API void llama_backend_init(bool numa); # LLAMA_API void llama_backend_init(void); @ctypes_function( "llama_backend_init", @@ -1037,7 +1081,6 @@ def llama_model_quantize_default_params() -> llama_model_quantize_params: ) def llama_backend_init(): """Initialize the llama + ggml backend - If numa is true, use NUMA optimizations Call once at the start of the program""" ... @@ -1059,6 +1102,18 @@ def llama_backend_init(): GGML_NUMA_STRATEGY_COUNT = 5 +# // Call once at the end of the program - currently only used for MPI +# LLAMA_API void llama_backend_free(void); +@ctypes_function( + "llama_backend_free", + [], + None, +) +def llama_backend_free(): + """Call once at the end of the program - currently only used for MPI""" + ... + + # //optional: # LLAMA_API void llama_numa_init(enum ggml_numa_strategy numa); @ctypes_function( @@ -1072,24 +1127,14 @@ def llama_numa_init(numa: int, /): # // Optional: an auto threadpool gets created in ggml if not passed explicitly # LLAMA_API void llama_attach_threadpool( -# struct llama_context * ctx, -# ggml_threadpool_t threadpool, -# ggml_threadpool_t threadpool_batch); +# struct llama_context * ctx, +# ggml_threadpool_t threadpool, +# ggml_threadpool_t threadpool_batch); +# TODO: Add llama_attach_threadpool # LLAMA_API void llama_detach_threadpool(struct llama_context * ctx); - - -# // Call once at the end of the program - currently only used for MPI -# LLAMA_API void llama_backend_free(void); -@ctypes_function( - "llama_backend_free", - [], - None, -) -def llama_backend_free(): - """Call once at the end of the program - currently only used for MPI""" - ... +# TODO: Add llama_detach_threadpool # DEPRECATED(LLAMA_API struct llama_model * llama_load_model_from_file( @@ -1107,6 +1152,9 @@ def llama_load_model_from_file( ... 
+# // Load the model from a file +# // If the file is split into multiple parts, the file name must follow this pattern: <name>-%05d-of-%05d.gguf +# // If the split file name does not follow this pattern, use llama_model_load_from_splits # LLAMA_API struct llama_model * llama_model_load_from_file( # const char * path_model, # struct llama_model_params params); @@ -1118,10 +1166,49 @@ def llama_load_model_from_file( def llama_model_load_from_file( path_model: bytes, params: llama_model_params, / ) -> Optional[llama_model_p]: + """Load the model from a file + + If the file is split into multiple parts, the file name must follow this pattern: <name>-%05d-of-%05d.gguf + + If the split file name does not follow this pattern, use llama_model_load_from_splits""" + ... + + +# // Load the model from multiple splits (support custom naming scheme) +# // The paths must be in the correct order +# LLAMA_API struct llama_model * llama_model_load_from_splits( +# const char ** paths, +# size_t n_paths, +# struct llama_model_params params); +@ctypes_function( + "llama_model_load_from_splits", + [ctypes.POINTER(ctypes.c_char_p), ctypes.c_size_t, llama_model_params], + llama_model_p_ctypes, +) +def llama_model_load_from_splits( + paths: List[bytes], n_paths: int, params: llama_model_params, / +) -> Optional[llama_model_p]: + """Load the model from multiple splits (support custom naming scheme) + + The paths must be in the correct order""" + ... + + +# LLAMA_API void llama_model_save_to_file( +# const struct llama_model * model, +# const char * path_model); +@ctypes_function( + "llama_model_save_to_file", + [llama_model_p_ctypes, ctypes.c_char_p], + None, +) +def llama_model_save_to_file(model: llama_model_p, path_model: bytes, /): + """Save the model to a file""" ... -# LLAMA_API void llama_free_model(struct llama_model * model); +# DEPRECATED(LLAMA_API void llama_free_model(struct llama_model * model), +# "use llama_model_free instead"); @ctypes_function( "llama_free_model", [llama_model_p_ctypes], @@ -1141,9 +1228,24 @@ def llama_model_free(model: llama_model_p, /): ... -# LLAMA_API struct llama_context * llama_new_context_with_model( +# LLAMA_API struct llama_context * llama_init_from_model( # struct llama_model * model, # struct llama_context_params params); +@ctypes_function( + "llama_init_from_model", + [llama_model_p_ctypes, llama_context_params], + llama_context_p_ctypes, +) +def llama_init_from_model( + model: llama_model_p, params: llama_context_params, / +) -> Optional[llama_context_p]: + ... + + +# DEPRECATED(LLAMA_API struct llama_context * llama_new_context_with_model( +# struct llama_model * model, +# struct llama_context_params params), +# "use llama_init_from_model instead"); @ctypes_function( "llama_new_context_with_model", [llama_model_p_ctypes, llama_context_params], @@ -1183,6 +1285,12 @@ def llama_max_devices() -> int: ... +# LLAMA_API size_t llama_max_parallel_sequences(void); +@ctypes_function("llama_max_parallel_sequences", [], ctypes.c_size_t) +def llama_max_parallel_sequences() -> int: + ... + + # LLAMA_API bool llama_supports_mmap (void); @ctypes_function("llama_supports_mmap", [], ctypes.c_bool) def llama_supports_mmap() -> bool: @@ -1231,65 +1339,147 @@ def llama_n_seq_max(ctx: llama_context_p, /) -> int: ... -# LLAMA_API int32_t llama_n_vocab (const struct llama_model * model); -@ctypes_function("llama_n_vocab", [llama_model_p_ctypes], ctypes.c_int32) -def llama_n_vocab(model: llama_model_p, /) -> int: - ...
- - -# LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model); +# DEPRECATED(LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model), "use llama_model_n_ctx_train instead"); @ctypes_function("llama_n_ctx_train", [llama_model_p_ctypes], ctypes.c_int32) def llama_n_ctx_train(model: llama_model_p, /) -> int: ... -# LLAMA_API int32_t llama_n_embd (const struct llama_model * model); +# DEPRECATED(LLAMA_API int32_t llama_n_embd (const struct llama_model * model), "use llama_model_n_embd instead"); @ctypes_function("llama_n_embd", [llama_model_p_ctypes], ctypes.c_int32) def llama_n_embd(model: llama_model_p, /) -> int: ... -# LLAMA_API int32_t llama_n_layer (const struct llama_model * model); +# DEPRECATED(LLAMA_API int32_t llama_n_layer (const struct llama_model * model), "use llama_model_n_layer instead"); @ctypes_function("llama_n_layer", [llama_model_p_ctypes], ctypes.c_int32) def llama_n_layer(model: llama_model_p, /) -> int: ... -# LLAMA_API int32_t llama_n_head (const struct llama_model * model); +# DEPRECATED(LLAMA_API int32_t llama_n_head (const struct llama_model * model), "use llama_model_n_head instead"); @ctypes_function("llama_n_head", [llama_model_p_ctypes], ctypes.c_int32) def llama_n_head(model: llama_model_p, /) -> int: ... -# LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx); +# DEPRECATED(LLAMA_API int32_t llama_n_vocab (const struct llama_vocab * vocab), "use llama_vocab_n_tokens instead"); +@ctypes_function("llama_n_vocab", [llama_vocab_p_ctypes], ctypes.c_int32) +def llama_n_vocab(model: llama_vocab_p, /) -> int: + ... + + +# LLAMA_API const struct llama_model * llama_get_model (const struct llama_context * ctx); @ctypes_function("llama_get_model", [llama_context_p_ctypes], llama_model_p_ctypes) def llama_get_model(ctx: llama_context_p, /) -> Optional[llama_model_p]: ... -# LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx); +# LLAMA_API llama_memory_t llama_get_memory (const struct llama_context * ctx); +@ctypes_function("llama_get_memory", [llama_context_p_ctypes], llama_memory_t_ctypes) +def llama_get_memory(ctx: llama_context_p, /) -> Optional[llama_memory_t]: + """Get the memory for the context""" + ... + + +# LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx); @ctypes_function("llama_pooling_type", [llama_context_p_ctypes], ctypes.c_int) def llama_pooling_type(ctx: llama_context_p, /) -> int: ... -# LLAMA_API enum llama_vocab_type llama_vocab_type (const struct llama_model * model); -@ctypes_function("llama_vocab_type", [llama_model_p_ctypes], ctypes.c_int) -def llama_vocab_type(model: llama_model_p, /) -> int: +# DEPRECATED(LLAMA_API struct llama_kv_cache * llama_get_kv_self(struct llama_context * ctx), "use llama_get_memory instead"); +@ctypes_function( + "llama_get_kv_self", + [llama_context_p_ctypes], + llama_kv_cache_p_ctypes, +) +def llama_get_kv_self(ctx: llama_context_p, /) -> Optional[llama_kv_cache_p]: + """Get the KV cache for self-attention (DEPRECATED)""" + ... + + +# LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model); +@ctypes_function("llama_model_get_vocab", [llama_model_p_ctypes], llama_vocab_p_ctypes) +def llama_model_get_vocab(model: llama_model_p, /) -> Optional[llama_vocab_p]: + ... 
+ + +# LLAMA_API enum llama_rope_type llama_model_rope_type(const struct llama_model * model); +@ctypes_function("llama_model_rope_type", [llama_model_p_ctypes], ctypes.c_int) +def llama_model_rope_type(model: llama_model_p, /) -> int: + ... + + +# LLAMA_API int32_t llama_model_n_ctx_train(const struct llama_model * model); +@ctypes_function("llama_model_n_ctx_train", [llama_model_p_ctypes], ctypes.c_int32) +def llama_model_n_ctx_train(model: llama_model_p, /) -> int: + ... + + +# LLAMA_API int32_t llama_model_n_embd (const struct llama_model * model); +@ctypes_function("llama_model_n_embd", [llama_model_p_ctypes], ctypes.c_int32) +def llama_model_n_embd(model: llama_model_p, /) -> int: + ... + + +# LLAMA_API int32_t llama_model_n_layer (const struct llama_model * model); +@ctypes_function("llama_model_n_layer", [llama_model_p_ctypes], ctypes.c_int32) +def llama_model_n_layer(model: llama_model_p, /) -> int: ... -# LLAMA_API enum llama_rope_type llama_rope_type (const struct llama_model * model); -@ctypes_function("llama_rope_type", [llama_model_p_ctypes], ctypes.c_int) -def llama_rope_type(model: llama_model_p, /) -> int: +# LLAMA_API int32_t llama_model_n_head (const struct llama_model * model); +@ctypes_function("llama_model_n_head", [llama_model_p_ctypes], ctypes.c_int32) +def llama_model_n_head(model: llama_model_p, /) -> int: + ... + + +# LLAMA_API int32_t llama_model_n_head_kv (const struct llama_model * model); +@ctypes_function("llama_model_n_head_kv", [llama_model_p_ctypes], ctypes.c_int32) +def llama_model_n_head_kv(model: llama_model_p, /) -> int: + ... + + +# LLAMA_API int32_t llama_model_n_swa (const struct llama_model * model); +@ctypes_function("llama_model_n_swa", [llama_model_p_ctypes], ctypes.c_int32) +def llama_model_n_swa(model: llama_model_p, /) -> int: + ... # // Get the model's RoPE frequency scaling factor -# LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model); -@ctypes_function("llama_rope_freq_scale_train", [llama_model_p_ctypes], ctypes.c_float) -def llama_rope_freq_scale_train(model: llama_model_p, /) -> float: - """Get the model's RoPE frequency scaling factor""" +# LLAMA_API float llama_model_rope_freq_scale_train(const struct llama_model * model); +@ctypes_function("llama_model_rope_freq_scale_train", [llama_model_p_ctypes], ctypes.c_float) +def llama_model_rope_freq_scale_train(model: llama_model_p, /) -> float: + ... + + +# // Returns the number of classifier outputs (only valid for classifier models) +# // Undefined behavior for non-classifier models +# LLAMA_API uint32_t llama_model_n_cls_out(const struct llama_model * model); +@ctypes_function("llama_model_n_cls_out", [llama_model_p_ctypes], ctypes.c_uint32) +def llama_model_n_cls_out(model: llama_model_p, /) -> int: + """Returns the number of classifier outputs (only valid for classifier models)""" + ... + + +# // Returns label of classifier output by index (<n_cls_out). Returns nullptr if no label provided +# LLAMA_API const char * llama_model_cls_label(const struct llama_model * model, uint32_t i); +@ctypes_function("llama_model_cls_label", [llama_model_p_ctypes, ctypes.c_uint32], ctypes.c_char_p) +def llama_model_cls_label(model: llama_model_p, i: int, /) -> Optional[bytes]: + """Returns label of classifier output by index. Returns None if no label provided""" + ... + + +# LLAMA_API enum llama_vocab_type llama_vocab_type (const struct llama_model * model); +@ctypes_function("llama_vocab_type", [llama_vocab_p_ctypes], ctypes.c_int) +def llama_vocab_type(vocab: llama_vocab_p, /) -> int: + ... + + +# LLAMA_API int32_t llama_vocab_n_tokens(const struct llama_vocab * vocab); +@ctypes_function("llama_vocab_n_tokens", [llama_vocab_p_ctypes], ctypes.c_int32) +def llama_vocab_n_tokens(vocab: llama_vocab_p, /) -> int: + ...
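For orientation, a minimal usage sketch of the accessors added above (the GGUF path is a placeholder; llama_backend_init, llama_model_default_params and llama_backend_free are assumed to be exposed by the same module):

    import llama_cpp

    llama_cpp.llama_backend_init()
    params = llama_cpp.llama_model_default_params()
    model = llama_cpp.llama_model_load_from_file(b"./model.gguf", params)  # placeholder path
    if model is not None:
        vocab = llama_cpp.llama_model_get_vocab(model)
        print("vocab size :", llama_cpp.llama_vocab_n_tokens(vocab))
        print("train ctx  :", llama_cpp.llama_model_n_ctx_train(model))
        print("embd dim   :", llama_cpp.llama_model_n_embd(model))
        llama_cpp.llama_model_free(model)
    llama_cpp.llama_backend_free()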
@@ -1402,6 +1592,16 @@ def llama_model_size(model: llama_model_p, /) -> int: ... +# // Get the default chat template. Returns nullptr if not available +# // If name is NULL, returns the default chat template +# LLAMA_API const char * llama_model_chat_template(const struct llama_model * model, const char * name); +@ctypes_function("llama_model_chat_template", [llama_model_p_ctypes, ctypes.c_char_p], ctypes.c_char_p) +def llama_model_chat_template(model: llama_model_p, name: Optional[bytes], /) -> Optional[bytes]: + """Get the default chat template. Returns None if not available + If name is None, returns the default chat template""" + ... + + # // Returns the total number of parameters in the model # LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model); @ctypes_function("llama_model_n_params", [llama_model_p_ctypes], ctypes.c_uint64) @@ -1471,38 +1671,53 @@ def llama_model_quantize( ... +# // +# // Adapters +# // + # // Load a LoRA adapter from file -# // The loaded adapter will be associated to the given model, and will be free when the model is deleted -# LLAMA_API struct llama_lora_adapter * llama_lora_adapter_init( +# LLAMA_API struct llama_adapter_lora * llama_adapter_lora_init( # struct llama_model * model, # const char * path_lora); @ctypes_function( - "llama_lora_adapter_init", + "llama_adapter_lora_init", [llama_model_p_ctypes, ctypes.c_char_p], - llama_lora_adapter_p_ctypes, + llama_adapter_lora_p_ctypes, ) -def llama_lora_adapter_init( +def llama_adapter_lora_init( model: llama_model_p, path_lora: bytes, / -) -> Optional[llama_lora_adapter_p]: - """Load a LoRA adapter from file - The loaded adapter will be associated to the given model, and will be free when the model is deleted - """ +) -> Optional[llama_adapter_lora_p]: + ... + + +# // Manually free a LoRA adapter +# // Note: loaded adapters will be free when the associated model is deleted +# LLAMA_API void llama_adapter_lora_free(struct llama_adapter_lora * adapter); +@ctypes_function( + "llama_adapter_lora_free", + [llama_adapter_lora_p_ctypes], + None, +) +def llama_adapter_lora_free(adapter: llama_adapter_lora_p, /): ... +# // The following functions operate on a llama_context, hence the naming: llama_verb_... 
+ + # // Add a loaded LoRA adapter to given context # // This will not modify model's weight -# LLAMA_API int32_t llama_lora_adapter_set( +# LLAMA_API int32_t llama_set_adapter_lora( # struct llama_context * ctx, -# struct llama_lora_adapter * adapter, +# struct llama_adapter_lora * adapter, # float scale); @ctypes_function( - "llama_lora_adapter_set", - [llama_context_p_ctypes, llama_lora_adapter_p_ctypes, ctypes.c_float], + "llama_set_adapter_lora", + [llama_context_p_ctypes, llama_adapter_lora_p_ctypes, ctypes.c_float], ctypes.c_int32, ) -def llama_lora_adapter_set( - ctx: llama_context_p, adapter: llama_lora_adapter_p, scale: float, / +def llama_set_adapter_lora( + ctx: llama_context_p, adapter: llama_adapter_lora_p, scale: float, / ) -> int: """Add a loaded LoRA adapter to given context This will not modify model's weight""" @@ -1511,64 +1726,49 @@ def llama_lora_adapter_set( # // Remove a specific LoRA adapter from given context # // Return -1 if the adapter is not present in the context -# LLAMA_API int32_t llama_lora_adapter_remove( +# LLAMA_API int32_t llama_rm_adapter_lora( # struct llama_context * ctx, -# struct llama_lora_adapter * adapter); +# struct llama_adapter_lora * adapter); @ctypes_function( - "llama_lora_adapter_remove", - [llama_context_p_ctypes, llama_lora_adapter_p_ctypes], + "llama_rm_adapter_lora", + [llama_context_p_ctypes, llama_adapter_lora_p_ctypes], ctypes.c_int32, ) -def llama_lora_adapter_remove( - ctx: llama_context_p, adapter: llama_lora_adapter_p, / +def llama_rm_adapter_lora( + ctx: llama_context_p, adapter: llama_adapter_lora_p, / ) -> int: - """Remove a LoRA adapter from given context + """Remove a specific LoRA adapter from given context Return -1 if the adapter is not present in the context""" ... # // Remove all LoRA adapters from given context -# LLAMA_API void llama_lora_adapter_clear( -# struct llama_context * ctx); +# LLAMA_API void llama_clear_adapter_lora(struct llama_context * ctx); @ctypes_function( - "llama_lora_adapter_clear", + "llama_clear_adapter_lora", [llama_context_p_ctypes], None, ) -def llama_lora_adapter_clear(ctx: llama_context_p, /): +def llama_clear_adapter_lora(ctx: llama_context_p, /): """Remove all LoRA adapters from given context""" ... -# // Manually free a LoRA adapter -# // Note: loaded adapters will be free when the associated model is deleted -# LLAMA_API void llama_lora_adapter_free(struct llama_lora_adapter * adapter); -@ctypes_function( - "llama_lora_adapter_free", - [llama_lora_adapter_p_ctypes], - None, -) -def llama_lora_adapter_free(adapter: llama_lora_adapter_p, /): - """Manually free a LoRA adapter - Note: loaded adapters will be free when the associated model is deleted""" - ... - - # // Apply a loaded control vector to a llama_context, or if data is NULL, clear # // the currently loaded vector. # // n_embd should be the size of a single layer's control, and data should point # // to an n_embd x n_layers buffer starting from layer 1. # // il_start and il_end are the layer range the vector should apply to (both inclusive) # // See llama_control_vector_load in common to load a control vector. 
-# LLAMA_API int32_t llama_control_vector_apply( -# struct llama_context * lctx, +# LLAMA_API int32_t llama_apply_adapter_cvec( +# struct llama_context * ctx, # const float * data, # size_t len, # int32_t n_embd, # int32_t il_start, # int32_t il_end); @ctypes_function( - "llama_control_vector_apply", + "llama_apply_adapter_cvec", [ llama_context_p_ctypes, ctypes.POINTER(ctypes.c_float), @@ -1579,8 +1779,8 @@ def llama_lora_adapter_free(adapter: llama_lora_adapter_p, /): ], ctypes.c_int32, ) -def llama_control_vector_apply( - lctx: llama_context_p, +def llama_apply_adapter_cvec( + ctx: llama_context_p, data: CtypesPointerOrRef[ctypes.c_float], len: int, n_embd: int, @@ -1598,148 +1798,22 @@ def llama_control_vector_apply( # // -# // KV cache +# // Memory # // - -# // Information associated with an individual cell in the KV cache view. -# struct llama_kv_cache_view_cell { -# // The position for this cell. Takes KV cache shifts into account. -# // May be negative if the cell is not populated. -# llama_pos pos; -# }; -class llama_kv_cache_view_cell(ctypes.Structure): - """Information associated with an individual cell in the KV cache view. - - Attributes: - pos (llama_pos): The position for this cell. Takes KV cache shifts into account. - May be negative if the cell is not populated.""" - - if TYPE_CHECKING: - pos: llama_pos - - _fields_ = [("pos", llama_pos)] - - -# // An updateable view of the KV cache. -# struct llama_kv_cache_view { -# // Number of KV cache cells. This will be the same as the context size. -# int32_t n_cells; - -# // Maximum number of sequences that can exist in a cell. It's not an error -# // if there are more sequences in a cell than this value, however they will -# // not be visible in the view cells_sequences. -# int32_t n_seq_max; - -# // Number of tokens in the cache. For example, if there are two populated -# // cells, the first with 1 sequence id in it and the second with 2 sequence -# // ids then you'll have 3 tokens. -# int32_t token_count; - -# // Number of populated cache cells. -# int32_t used_cells; - -# // Maximum contiguous empty slots in the cache. -# int32_t max_contiguous; - -# // Index to the start of the max_contiguous slot range. Can be negative -# // when cache is full. -# int32_t max_contiguous_idx; - -# // Information for an individual cell. -# struct llama_kv_cache_view_cell * cells; - - -# // The sequences for each cell. There will be n_seq_max items per cell. -# llama_seq_id * cells_sequences; -# }; -class llama_kv_cache_view(ctypes.Structure): - if TYPE_CHECKING: - n_cells: int - n_max_seq: int - token_count: int - used_cells: int - max_contiguous: int - max_contiguous_idx: int - cells: CtypesArray[llama_kv_cache_view_cell] - cells_sequences: CtypesArray[llama_seq_id] - - _fields_ = [ - ("n_cells", ctypes.c_int32), - ("n_max_seq", ctypes.c_int32), - ("token_count", ctypes.c_int32), - ("used_cells", ctypes.c_int32), - ("max_contiguous", ctypes.c_int32), - ("max_contiguous_idx", ctypes.c_int32), - ("cells", ctypes.POINTER(llama_kv_cache_view_cell)), - ("cells_sequences", ctypes.POINTER(llama_seq_id)), - ] - - -llama_kv_cache_view_p = ctypes.POINTER(llama_kv_cache_view) - - -# // Create an empty KV cache view. 
(use only for debugging purposes) -# LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_seq_max); -@ctypes_function( - "llama_kv_cache_view_init", - [llama_context_p_ctypes, ctypes.c_int32], - llama_kv_cache_view, -) -def llama_kv_cache_view_init( - ctx: llama_context_p, n_seq_max: Union[ctypes.c_int32, int], / -) -> llama_kv_cache_view: - """Create an empty KV cache view. (use only for debugging purposes)""" - ... - - -# // Free a KV cache view. (use only for debugging purposes) -# LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view); -@ctypes_function("llama_kv_cache_view_free", [llama_kv_cache_view_p], None) -def llama_kv_cache_view_free(view: "ctypes.pointer[llama_kv_cache_view]", /): # type: ignore - """Free a KV cache view. (use only for debugging purposes)""" - ... - - -# // Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes) -# LLAMA_API void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view); -@ctypes_function( - "llama_kv_cache_view_update", [llama_context_p_ctypes, llama_kv_cache_view_p], None -) -def llama_kv_cache_view_update(ctx: llama_context_p, view: CtypesPointerOrRef[llama_kv_cache_view], /): # type: ignore - """Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)""" - ... - - -# // Returns the number of tokens in the KV cache (slow, use only for debug) -# // If a KV cell has multiple sequences assigned to it, it will be counted multiple times -# LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx); -@ctypes_function( - "llama_get_kv_cache_token_count", [llama_context_p_ctypes], ctypes.c_int32 -) -def llama_get_kv_cache_token_count(ctx: llama_context_p, /) -> int: - """Returns the number of tokens in the KV cache (slow, use only for debug) - If a KV cell has multiple sequences assigned to it, it will be counted multiple times - """ - ... - - -# // Returns the number of used KV cells (i.e. have at least one sequence assigned to them) -# LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx); +# // Clear the memory contents +# // If data == true, the data buffers will also be cleared together with the metadata +# LLAMA_API void llama_memory_clear( +# llama_memory_t mem, +# bool data); @ctypes_function( - "llama_get_kv_cache_used_cells", [llama_context_p_ctypes], ctypes.c_int32 + "llama_memory_clear", + [llama_memory_t_ctypes, ctypes.c_bool], + None, ) -def llama_get_kv_cache_used_cells(ctx: llama_context_p, /) -> int: - """Returns the number of used KV cells (i.e. have at least one sequence assigned to them)""" - ... - - -# // Clear the KV cache - both cell info is erased and KV data is zeroed -# LLAMA_API void llama_kv_cache_clear( -# struct llama_context * ctx); -@ctypes_function("llama_kv_cache_clear", [llama_context_p_ctypes], None) -def llama_kv_cache_clear(ctx: llama_context_p, /): - """Clear the KV cache""" +def llama_memory_clear(mem: llama_memory_t, data: bool, /): + """Clear the memory contents + If data == true, the data buffers will also be cleared together with the metadata""" ... 
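A short sketch of the new llama_memory entry point, assuming a loaded `model` as in the sketch above (llama_context_default_params and llama_free are assumed to be available from this module):

    cparams = llama_cpp.llama_context_default_params()
    ctx = llama_cpp.llama_init_from_model(model, cparams)

    mem = llama_cpp.llama_get_memory(ctx)
    # data=True also zeroes the KV data buffers; data=False erases only the metadata.
    llama_cpp.llama_memory_clear(mem, True)

    llama_cpp.llama_free(ctx)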
@@ -1748,23 +1822,23 @@ def llama_kv_cache_clear(ctx: llama_context_p, /): # // seq_id < 0 : match any sequence # // p0 < 0 : [0, p1] # // p1 < 0 : [p0, inf) -# LLAMA_API bool llama_kv_cache_seq_rm( -# struct llama_context * ctx, -# llama_seq_id seq_id, -# llama_pos p0, -# llama_pos p1); +# LLAMA_API bool llama_memory_seq_rm( +# llama_memory_t mem, +# llama_seq_id seq_id, +# llama_pos p0, +# llama_pos p1); @ctypes_function( - "llama_kv_cache_seq_rm", + "llama_memory_seq_rm", [ - llama_context_p_ctypes, + llama_memory_t_ctypes, llama_seq_id, llama_pos, llama_pos, ], ctypes.c_bool, ) -def llama_kv_cache_seq_rm( - ctx: llama_context_p, +def llama_memory_seq_rm( + mem: llama_memory_t, seq_id: Union[llama_seq_id, int], p0: Union[llama_pos, int], p1: Union[llama_pos, int], @@ -1781,19 +1855,18 @@ def llama_kv_cache_seq_rm( # // Copy all tokens that belong to the specified sequence to another sequence -# // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence # // p0 < 0 : [0, p1] # // p1 < 0 : [p0, inf) -# LLAMA_API void llama_kv_cache_seq_cp( -# struct llama_context * ctx, -# llama_seq_id seq_id_src, -# llama_seq_id seq_id_dst, -# llama_pos p0, -# llama_pos p1); +# LLAMA_API void llama_memory_seq_cp( +# llama_memory_t mem, +# llama_seq_id seq_id_src, +# llama_seq_id seq_id_dst, +# llama_pos p0, +# llama_pos p1); @ctypes_function( - "llama_kv_cache_seq_cp", + "llama_memory_seq_cp", [ - llama_context_p_ctypes, + llama_memory_t_ctypes, llama_seq_id, llama_seq_id, llama_pos, @@ -1801,8 +1874,8 @@ def llama_kv_cache_seq_rm( ], None, ) -def llama_kv_cache_seq_cp( - ctx: llama_context_p, +def llama_memory_seq_cp( + mem: llama_memory_t, seq_id_src: Union[llama_seq_id, int], seq_id_dst: Union[llama_seq_id, int], p0: Union[llama_pos, int], @@ -1810,40 +1883,36 @@ def llama_kv_cache_seq_cp( /, ): """Copy all tokens that belong to the specified sequence to another sequence - Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence p0 < 0 : [0, p1] p1 < 0 : [p0, inf)""" ... # // Removes all tokens that do not belong to the specified sequence -# LLAMA_API void llama_kv_cache_seq_keep( -# struct llama_context * ctx, -# llama_seq_id seq_id); +# LLAMA_API void llama_memory_seq_keep( +# llama_memory_t mem, +# llama_seq_id seq_id); @ctypes_function( - "llama_kv_cache_seq_keep", [llama_context_p_ctypes, llama_seq_id], None + "llama_memory_seq_keep", [llama_memory_t_ctypes, llama_seq_id], None ) -def llama_kv_cache_seq_keep(ctx: llama_context_p, seq_id: Union[llama_seq_id, int], /): +def llama_memory_seq_keep(mem: llama_memory_t, seq_id: Union[llama_seq_id, int], /): """Removes all tokens that do not belong to the specified sequence""" ... 
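As an illustrative sketch, the sequence helpers above can be combined for parallel decoding; the sequence ids below are arbitrary and `ctx` is assumed to exist:

    mem = llama_cpp.llama_get_memory(ctx)
    # Share the already-evaluated prompt of sequence 0 with sequence 1 (p1 < 0 means [p0, inf)).
    llama_cpp.llama_memory_seq_cp(mem, 0, 1, 0, -1)
    # ... decode both sequences in parallel, then keep only the chosen one ...
    llama_cpp.llama_memory_seq_keep(mem, 1)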
# // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1) -# // If the KV cache is RoPEd, the KV data is updated accordingly: -# // - lazily on next llama_decode() -# // - explicitly with llama_kv_cache_update() # // p0 < 0 : [0, p1] # // p1 < 0 : [p0, inf) -# LLAMA_API void llama_kv_cache_seq_add( -# struct llama_context * ctx, -# llama_seq_id seq_id, -# llama_pos p0, -# llama_pos p1, -# llama_pos delta); +# LLAMA_API void llama_memory_seq_add( +# llama_memory_t mem, +# llama_seq_id seq_id, +# llama_pos p0, +# llama_pos p1, +# llama_pos delta); @ctypes_function( - "llama_kv_cache_seq_add", + "llama_memory_seq_add", [ - llama_context_p_ctypes, + llama_memory_t_ctypes, llama_seq_id, llama_pos, llama_pos, @@ -1851,8 +1920,8 @@ def llama_kv_cache_seq_keep(ctx: llama_context_p, seq_id: Union[llama_seq_id, in ], None, ) -def llama_kv_cache_seq_add( - ctx: llama_context_p, +def llama_memory_seq_add( + mem: llama_memory_t, seq_id: Union[llama_seq_id, int], p0: Union[llama_pos, int], p1: Union[llama_pos, int], @@ -1860,28 +1929,24 @@ def llama_kv_cache_seq_add( /, ): """Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1) - If the KV cache is RoPEd, the KV data is updated accordingly: - - lazily on next llama_decode() - - explicitly with llama_kv_cache_update() p0 < 0 : [0, p1] p1 < 0 : [p0, inf)""" ... # // Integer division of the positions by factor of `d > 1` -# // If the KV cache is RoPEd, the KV data is updated accordingly # // p0 < 0 : [0, p1] # // p1 < 0 : [p0, inf) -# LLAMA_API void llama_kv_cache_seq_div( -# struct llama_context * ctx, -# llama_seq_id seq_id, -# llama_pos p0, -# llama_pos p1, -# int d); +# LLAMA_API void llama_memory_seq_div( +# llama_memory_t mem, +# llama_seq_id seq_id, +# llama_pos p0, +# llama_pos p1, +# int d); @ctypes_function( - "llama_kv_cache_seq_div", + "llama_memory_seq_div", [ - llama_context_p_ctypes, + llama_memory_t_ctypes, llama_seq_id, llama_pos, llama_pos, @@ -1889,8 +1954,8 @@ def llama_kv_cache_seq_add( ], None, ) -def llama_kv_cache_seq_div( - ctx: llama_context_p, +def llama_memory_seq_div( + mem: llama_memory_t, seq_id: Union[llama_seq_id, int], p0: Union[llama_pos, int], p1: Union[llama_pos, int], @@ -1898,54 +1963,318 @@ def llama_kv_cache_seq_div( /, ): """Integer division of the positions by factor of `d > 1` - If the KV cache is RoPEd, the KV data is updated accordingly p0 < 0 : [0, p1] p1 < 0 : [p0, inf)""" ... 
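A minimal sketch of a simple context shift built from llama_memory_seq_rm and llama_memory_seq_add (the n_past / n_discard values are illustrative):

    n_past, n_discard = 512, 128
    mem = llama_cpp.llama_get_memory(ctx)
    # Drop the oldest n_discard positions of sequence 0 ...
    llama_cpp.llama_memory_seq_rm(mem, 0, 0, n_discard)
    # ... then slide the remaining positions back so decoding can continue.
    llama_cpp.llama_memory_seq_add(mem, 0, n_discard, n_past, -n_discard)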
-# // Defragment the KV cache -# // This will be applied: -# // - lazily on next llama_decode() -# // - explicitly with llama_kv_cache_update() -# LLAMA_API void llama_kv_cache_defrag(struct llama_context * ctx); -@ctypes_function("llama_kv_cache_defrag", [llama_context_p_ctypes], None) -def llama_kv_cache_defrag(ctx: llama_context_p, /): - """Defragment the KV cache - This will be applied: - - lazily on next llama_decode() - - explicitly with llama_kv_cache_update()""" +# // Returns the smallest position present in the memory for the specified sequence +# // This is typically non-zero only for SWA caches +# // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the memory +# // Return -1 if the sequence is empty +# LLAMA_API llama_pos llama_memory_seq_pos_min( +# llama_memory_t mem, +# llama_seq_id seq_id); +@ctypes_function( + "llama_memory_seq_pos_min", [llama_memory_t_ctypes, llama_seq_id], llama_pos +) +def llama_memory_seq_pos_min( + mem: llama_memory_t, seq_id: Union[llama_seq_id, int], / +) -> int: + """Returns the smallest position present in the memory for the specified sequence + This is typically non-zero only for SWA caches + Return -1 if the sequence is empty""" ... -# // Apply the KV cache updates (such as K-shifts, defragmentation, etc.) -# LLAMA_API void llama_kv_cache_update(struct llama_context * ctx); -@ctypes_function("llama_kv_cache_update", [llama_context_p_ctypes], None) -def llama_kv_cache_update(ctx: llama_context_p, /): - """Apply the KV cache updates (such as K-shifts, defragmentation, etc.)""" +# // Returns the largest position present in the memory for the specified sequence +# // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the memory +# // Return -1 if the sequence is empty +# LLAMA_API llama_pos llama_memory_seq_pos_max( +# llama_memory_t mem, +# llama_seq_id seq_id); +@ctypes_function( + "llama_memory_seq_pos_max", [llama_memory_t_ctypes, llama_seq_id], llama_pos +) +def llama_memory_seq_pos_max( + mem: llama_memory_t, seq_id: Union[llama_seq_id, int], / +) -> int: + """Returns the largest position present in the memory for the specified sequence + Return -1 if the sequence is empty""" ... -# // Check if the context supports KV cache shifting -# LLAMA_API bool llama_kv_cache_can_shift(struct llama_context * ctx); -@ctypes_function("llama_kv_cache_can_shift", [llama_context_p_ctypes], ctypes.c_bool) -def llama_kv_cache_can_shift(ctx: llama_context_p, /) -> bool: - """Check if the context supports KV cache shifting""" +# // Check if the memory supports shifting +# LLAMA_API bool llama_memory_can_shift(llama_memory_t mem); +@ctypes_function("llama_memory_can_shift", [llama_memory_t_ctypes], ctypes.c_bool) +def llama_memory_can_shift(mem: llama_memory_t, /) -> bool: + """Check if the memory supports shifting""" ... # // -# // State / sessions +# // KV cache for self-attention (TODO: deprecate in favor of llama_memory) # // - -# // Returns the *actual* size in bytes of the state -# // (logits, embedding and kv_cache) -# // Only use when saving the state, not when restoring it, otherwise the size may be too small. 
+# // Returns the number of tokens in the KV cache (slow, use only for debug) +# // If a KV cell has multiple sequences assigned to it, it will be counted multiple times +# DEPRECATED(LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx), +# "Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)"); +@ctypes_function( + "llama_kv_self_n_tokens", [llama_context_p_ctypes], ctypes.c_int32 +) +def llama_kv_self_n_tokens(ctx: llama_context_p, /) -> int: + """Returns the number of tokens in the KV cache (slow, use only for debug) (DEPRECATED)""" + ... + + +# // Returns the number of used KV cells (i.e. have at least one sequence assigned to them) +# DEPRECATED(LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx), +# "Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)"); +@ctypes_function( + "llama_kv_self_used_cells", [llama_context_p_ctypes], ctypes.c_int32 +) +def llama_kv_self_used_cells(ctx: llama_context_p, /) -> int: + """Returns the number of used KV cells (DEPRECATED)""" + ... + + +# // Clear the KV cache - both cell info is erased and KV data is zeroed +# DEPRECATED(LLAMA_API void llama_kv_self_clear( +# struct llama_context * ctx), +# "Use llama_memory_clear() instead"); +@ctypes_function( + "llama_kv_self_clear", [llama_context_p_ctypes], None +) +def llama_kv_self_clear(ctx: llama_context_p, /): + """Clear the KV cache (DEPRECATED)""" + ... + + +# // Removes all tokens that belong to the specified sequence and have positions in [p0, p1) +# // Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails +# // seq_id < 0 : match any sequence +# // p0 < 0 : [0, p1] +# // p1 < 0 : [p0, inf) +# DEPRECATED(LLAMA_API bool llama_kv_self_seq_rm( +# struct llama_context * ctx, +# llama_seq_id seq_id, +# llama_pos p0, +# llama_pos p1), +# "Use llama_memory_seq_rm() instead"); +@ctypes_function( + "llama_kv_self_seq_rm", + [ + llama_context_p_ctypes, + llama_seq_id, + llama_pos, + llama_pos, + ], + ctypes.c_bool, +) +def llama_kv_self_seq_rm( + ctx: llama_context_p, + seq_id: Union[llama_seq_id, int], + p0: Union[llama_pos, int], + p1: Union[llama_pos, int], + /, +) -> bool: + """Remove tokens from KV cache (DEPRECATED)""" + ... + + +# // Copy all tokens that belong to the specified sequence to another sequence +# // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence +# // p0 < 0 : [0, p1] +# // p1 < 0 : [p0, inf) +# DEPRECATED(LLAMA_API void llama_kv_self_seq_cp( +# struct llama_context * ctx, +# llama_seq_id seq_id_src, +# llama_seq_id seq_id_dst, +# llama_pos p0, +# llama_pos p1), +# "Use llama_memory_seq_cp() instead"); +@ctypes_function( + "llama_kv_self_seq_cp", + [ + llama_context_p_ctypes, + llama_seq_id, + llama_seq_id, + llama_pos, + llama_pos, + ], + None, +) +def llama_kv_self_seq_cp( + ctx: llama_context_p, + seq_id_src: Union[llama_seq_id, int], + seq_id_dst: Union[llama_seq_id, int], + p0: Union[llama_pos, int], + p1: Union[llama_pos, int], + /, +): + """Copy tokens in KV cache (DEPRECATED)""" + ... 
+ + +# // Removes all tokens that do not belong to the specified sequence +# DEPRECATED(LLAMA_API void llama_kv_self_seq_keep( +# struct llama_context * ctx, +# llama_seq_id seq_id), +# "Use llama_memory_seq_keep() instead"); +@ctypes_function( + "llama_kv_self_seq_keep", [llama_context_p_ctypes, llama_seq_id], None +) +def llama_kv_self_seq_keep(ctx: llama_context_p, seq_id: Union[llama_seq_id, int], /): + """Keep only specified sequence in KV cache (DEPRECATED)""" + ... + + +# // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1) +# // If the KV cache is RoPEd, the KV data is updated accordingly: +# // - lazily on next llama_decode() +# // p0 < 0 : [0, p1] +# // p1 < 0 : [p0, inf) +# DEPRECATED(LLAMA_API void llama_kv_self_seq_add( +# struct llama_context * ctx, +# llama_seq_id seq_id, +# llama_pos p0, +# llama_pos p1, +# llama_pos delta), +# "Use llama_memory_seq_add() instead"); +@ctypes_function( + "llama_kv_self_seq_add", + [ + llama_context_p_ctypes, + llama_seq_id, + llama_pos, + llama_pos, + llama_pos, + ], + None, +) +def llama_kv_self_seq_add( + ctx: llama_context_p, + seq_id: Union[llama_seq_id, int], + p0: Union[llama_pos, int], + p1: Union[llama_pos, int], + delta: Union[llama_pos, int], + /, +): + """Add delta to sequence positions in KV cache (DEPRECATED)""" + ... + + +# // Integer division of the positions by factor of `d > 1` +# // If the KV cache is RoPEd, the KV data is updated accordingly: +# // - lazily on next llama_decode() +# // p0 < 0 : [0, p1] +# // p1 < 0 : [p0, inf) +# DEPRECATED(void llama_kv_self_seq_div( +# struct llama_context * ctx, +# llama_seq_id seq_id, +# llama_pos p0, +# llama_pos p1, +# int d), +# "Use llama_memory_seq_div() instead"); +@ctypes_function( + "llama_kv_self_seq_div", + [ + llama_context_p_ctypes, + llama_seq_id, + llama_pos, + llama_pos, + ctypes.c_int, + ], + None, +) +def llama_kv_self_seq_div( + ctx: llama_context_p, + seq_id: Union[llama_seq_id, int], + p0: Union[llama_pos, int], + p1: Union[llama_pos, int], + d: Union[ctypes.c_int, int], + /, +): + """Divide sequence positions in KV cache (DEPRECATED)""" + ... + + +# // Returns the smallest position present in the KV cache for the specified sequence +# // This is typically non-zero only for SWA caches +# // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache +# // Return -1 if the sequence is empty +# DEPRECATED(LLAMA_API llama_pos llama_kv_self_seq_pos_min( +# struct llama_context * ctx, +# llama_seq_id seq_id), +# "Use llama_memory_seq_pos_min() instead"); +@ctypes_function( + "llama_kv_self_seq_pos_min", [llama_context_p_ctypes, llama_seq_id], llama_pos +) +def llama_kv_self_seq_pos_min( + ctx: llama_context_p, seq_id: Union[llama_seq_id, int], / +) -> int: + """Returns the smallest position in KV cache for sequence (DEPRECATED)""" + ... 
+ + +# // Returns the largest position present in the KV cache for the specified sequence +# // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache +# // Return -1 if the sequence is empty +# DEPRECATED(LLAMA_API llama_pos llama_kv_self_seq_pos_max( +# struct llama_context * ctx, +# llama_seq_id seq_id), +# "Use llama_memory_seq_pos_max() instead"); +@ctypes_function( + "llama_kv_self_seq_pos_max", [llama_context_p_ctypes, llama_seq_id], llama_pos +) +def llama_kv_self_seq_pos_max( + ctx: llama_context_p, seq_id: Union[llama_seq_id, int], / +) -> int: + """Returns the largest position in KV cache for sequence (DEPRECATED)""" + ... + + +# // Defragment the KV cache +# // This will be applied: +# // - lazily on next llama_decode() +# DEPRECATED(LLAMA_API void llama_kv_self_defrag(struct llama_context * ctx), +# "simply remove this call, the context will automatically decide when to do a defragmentation based on 'defrag_thold'"); +@ctypes_function("llama_kv_self_defrag", [llama_context_p_ctypes], None) +def llama_kv_self_defrag(ctx: llama_context_p, /): + """Defragment the KV cache (DEPRECATED)""" + ... + + +# // Check if the context supports KV cache shifting +# DEPRECATED(LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx), +# "use llama_memory_can_shift() instead"); +@ctypes_function("llama_kv_self_can_shift", [llama_context_p_ctypes], ctypes.c_bool) +def llama_kv_self_can_shift(ctx: llama_context_p, /) -> bool: + """Check if the context supports KV cache shifting (DEPRECATED)""" + ... + + +# // Apply the KV cache updates (such as K-shifts, defragmentation, etc.) +# DEPRECATED(LLAMA_API void llama_kv_self_update(struct llama_context * ctx), +# "simply remove this call, updates are applied lazily on the next llama_decode()"); +@ctypes_function("llama_kv_self_update", [llama_context_p_ctypes], None) +def llama_kv_self_update(ctx: llama_context_p, /): + """Apply the KV cache updates (DEPRECATED)""" + ... + + +# // +# // State / sessions +# // + +# // Returns the *actual* size in bytes of the state +# // (logits, embedding and memory) +# // Only use when saving the state, not when restoring it, otherwise the size may be too small. # LLAMA_API size_t llama_state_get_size(struct llama_context * ctx); @ctypes_function("llama_state_get_size", [llama_context_p_ctypes], ctypes.c_size_t) def llama_state_get_size(ctx: llama_context_p, /) -> int: - """Returns the *actual* size in bytes of the state (rng, logits, embedding and kv_cache) - will often be smaller after compacting tokens""" + """Returns the *actual* size in bytes of the state (logits, embedding and memory)""" ... @@ -1953,8 +2282,7 @@ def llama_state_get_size(ctx: llama_context_p, /) -> int: # "use llama_state_get_size instead"); @ctypes_function("llama_get_state_size", [llama_context_p_ctypes], ctypes.c_size_t) def llama_get_state_size(ctx: llama_context_p, /) -> int: - """Returns the maximum size in bytes of the state (rng, logits, embedding - and kv_cache) - will often be smaller after compacting tokens""" + """Returns the size in bytes of the state (DEPRECATED)""" ... @@ -2001,9 +2329,7 @@ def llama_state_get_data( def llama_copy_state_data( ctx: llama_context_p, dst: CtypesArray[ctypes.c_uint8], / ) -> int: - """Copies the state to the specified destination address. - Destination needs to have allocated enough memory. - Returns the number of bytes copied""" + """Copies the state to the specified destination address (DEPRECATED)""" ... 
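A small sketch of taking a full state snapshot with the functions above; llama_copy_state_data is used here because its complete signature is visible, while the preferred llama_state_get_data is assumed to take an additional size argument:

    import ctypes

    n = llama_cpp.llama_state_get_size(ctx)   # exact size needed for the snapshot
    buf = (ctypes.c_uint8 * n)()              # destination buffer of that size
    written = llama_cpp.llama_copy_state_data(ctx, buf)
    assert written <= n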
@@ -2041,7 +2367,7 @@ def llama_state_set_data( def llama_set_state_data( ctx: llama_context_p, src: CtypesArray[ctypes.c_uint8], / ) -> int: - """Set the state reading from the specified address""" + """Set the state reading from the specified address (DEPRECATED)""" ... @@ -2090,7 +2416,7 @@ def llama_state_load_file( ctypes.c_size_t, ctypes.POINTER(ctypes.c_size_t), ], - ctypes.c_size_t, + ctypes.c_bool, ) def llama_load_session_file( ctx: llama_context_p, @@ -2099,7 +2425,7 @@ def llama_load_session_file( n_token_capacity: Union[ctypes.c_size_t, int], n_token_count_out: CtypesPointerOrRef[ctypes.c_size_t], /, -) -> int: +) -> bool: ... @@ -2142,7 +2468,7 @@ def llama_state_save_file( llama_token_p, ctypes.c_size_t, ], - ctypes.c_size_t, + ctypes.c_bool, ) def llama_save_session_file( ctx: llama_context_p, @@ -2150,11 +2476,11 @@ def llama_save_session_file( tokens: CtypesArray[llama_token], n_token_count: Union[ctypes.c_size_t, int], /, -) -> int: +) -> bool: ... -# // Get the exact size needed to copy the KV cache of a single sequence +# // Get the exact size needed to copy the state of a single sequence # LLAMA_API size_t llama_state_seq_get_size( # struct llama_context * ctx, # llama_seq_id seq_id); @@ -2164,11 +2490,11 @@ def llama_save_session_file( ctypes.c_size_t, ) def llama_state_seq_get_size(ctx: llama_context_p, seq_id: llama_seq_id, /) -> int: - """Get the exact size needed to copy the KV cache of a single sequence""" + """Get the exact size needed to copy the state of a single sequence""" ... -# // Copy the KV cache of a single sequence into the specified buffer +# // Copy the state of a single sequence into the specified buffer # LLAMA_API size_t llama_state_seq_get_data( # struct llama_context * ctx, # uint8_t * dst, @@ -2191,7 +2517,7 @@ def llama_state_seq_get_data( seq_id: llama_seq_id, /, ) -> int: - """Copy the KV cache of a single sequence into the specified buffer""" + """Copy the state of a single sequence into the specified buffer""" ... @@ -2221,7 +2547,7 @@ def llama_state_seq_set_data( dest_seq_id: llama_seq_id, /, ) -> int: - """Copy the sequence data (originally copied with `llama_state_seq_get_data`) into the specified sequence""" + """Copy the sequence data into the specified sequence""" ... @@ -2288,7 +2614,6 @@ def llama_state_seq_load_file( # // Decoding # // - # // Return batch for single sequence of tokens # // The sequence ID will be fixed to 0 # // The position of the tokens will be tracked automatically by llama_decode @@ -2311,7 +2636,7 @@ def llama_batch_get_one( n_tokens: Union[ctypes.c_int, int], /, ) -> llama_batch: - """Return batch for single sequence of tokens starting at pos_0 + """Return batch for single sequence of tokens NOTE: this is a helper function to facilitate transition to the new batch API - avoid using it """ @@ -2356,35 +2681,46 @@ def llama_batch_free(batch: llama_batch, /): ... -# // Processes a batch of tokens with the ecoder part of the encoder-decoder model. -# // Stores the encoder output internally for later use by the decoder cross-attention layers. +# // Process a batch of tokens. +# // In contrast to llama_decode() - this call does not use KV cache. +# // For encode-decoder contexts, processes the batch using the encoder. +# // Can store the encoder output internally for later use by the decoder's cross-attention layers. # // 0 - success -# // < 0 - error +# // < 0 - error. 
the memory state is restored to the state before this call # LLAMA_API int32_t llama_encode( # struct llama_context * ctx, # struct llama_batch batch); @ctypes_function("llama_encode", [llama_context_p_ctypes, llama_batch], ctypes.c_int32) def llama_encode(ctx: llama_context_p, batch: llama_batch, /) -> int: - """Processes a batch of tokens with the ecoder part of the encoder-decoder model. - Stores the encoder output internally for later use by the decoder cross-attention layers. + """Process a batch of tokens using the encoder. 0 - success < 0 - error""" ... +# // Process a batch of tokens. +# // Requires the context to have a memory. +# // For encode-decoder contexts, processes the batch using the decoder. # // Positive return values does not mean a fatal error, but rather a warning. -# // 0 - success -# // 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context) -# // < 0 - error +# // Upon fatal-error or abort, the ubatches that managed to be been processed will remain in the memory state of the context +# // To handle this correctly, query the memory state using llama_memory_seq_pos_min() and llama_memory_seq_pos_max() +# // Upon other return values, the memory state is restored to the state before this call +# // 0 - success +# // 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context) +# // 2 - aborted (processed ubatches will remain in the context's memory) +# // -1 - invalid input batch +# // < -1 - fatal error (processed ubatches will remain in the context's memory) # LLAMA_API int32_t llama_decode( # struct llama_context * ctx, # struct llama_batch batch); @ctypes_function("llama_decode", [llama_context_p_ctypes, llama_batch], ctypes.c_int32) def llama_decode(ctx: llama_context_p, batch: llama_batch, /) -> int: - """Positive return values does not mean a fatal error, but rather a warning. + """Process a batch of tokens. 0 - success 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context) - < 0 - error""" + 2 - aborted (processed ubatches will remain in the context's memory) + -1 - invalid input batch + < -1 - fatal error (processed ubatches will remain in the context's memory)""" ... @@ -2430,13 +2766,12 @@ def llama_n_threads_batch(ctx: llama_context_p, /) -> int: ... -# // Set whether the model is in embeddings mode or not -# // If true, embeddings will be returned but logits will not +# // Set whether the context outputs embeddings or not +# // TODO: rename to avoid confusion with llama_get_embeddings() # LLAMA_API void llama_set_embeddings(struct llama_context * ctx, bool embeddings); @ctypes_function("llama_set_embeddings", [llama_context_p_ctypes, ctypes.c_bool], None) def llama_set_embeddings(ctx: llama_context_p, embeddings: bool, /): - """Set whether the model is in embeddings model or not - If true, embeddings will be returned but logits will not""" + """Set whether the context outputs embeddings or not""" ... @@ -2450,6 +2785,16 @@ def llama_set_causal_attn(ctx: llama_context_p, causal_attn: bool, /): ... +# // Set whether the model is in warmup mode or not +# // If true, all model tensors are activated during llama_decode() to load and cache their weights. 
+# LLAMA_API void llama_set_warmup(struct llama_context * ctx, bool warmup); +@ctypes_function("llama_set_warmup", [llama_context_p_ctypes, ctypes.c_bool], None) +def llama_set_warmup(ctx: llama_context_p, warmup: bool, /): + """Set whether the model is in warmup mode or not + If true, all model tensors are activated during llama_decode() to load and cache their weights.""" + ... + + # // Set abort callback # LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data); @ctypes_function( @@ -2555,7 +2900,7 @@ def llama_get_embeddings_ith( # // Get the embeddings for a sequence id # // Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE -# // when pooling_type == LLAMA_POOLING_TYPE_RANK, returns float[1] with the rank of the sequence +# // when pooling_type == LLAMA_POOLING_TYPE_RANK, returns float[n_cls_out] with the rank(s) of the sequence # // otherwise: float[n_embd] (1-dimensional) # LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id); @ctypes_function( @@ -2576,182 +2921,415 @@ def llama_get_embeddings_seq( # // Vocab # // - -# LLAMA_API const char * llama_token_get_text(const struct llama_model * model, llama_token token); +# LLAMA_API const char * llama_vocab_get_text(const struct llama_vocab * vocab, llama_token token); @ctypes_function( - "llama_token_get_text", [llama_model_p_ctypes, llama_token], ctypes.c_char_p + "llama_vocab_get_text", [llama_vocab_p_ctypes, llama_token], ctypes.c_char_p ) -def llama_token_get_text( - model: llama_model_p, token: Union[llama_token, int], / +def llama_vocab_get_text( + vocab: llama_vocab_p, token: Union[llama_token, int], / ) -> bytes: ... -# LLAMA_API float llama_token_get_score(const struct llama_model * model, llama_token token); +# LLAMA_API float llama_vocab_get_score(const struct llama_vocab * vocab, llama_token token); @ctypes_function( - "llama_token_get_score", [llama_model_p_ctypes, llama_token], ctypes.c_float + "llama_vocab_get_score", [llama_vocab_p_ctypes, llama_token], ctypes.c_float ) -def llama_token_get_score( - model: llama_model_p, token: Union[llama_token, int], / +def llama_vocab_get_score( + vocab: llama_vocab_p, token: Union[llama_token, int], / ) -> float: ... -# LLAMA_API enum llama_token_attr llama_token_get_attr(const struct llama_model * model, llama_token token); +# LLAMA_API enum llama_token_attr llama_vocab_get_attr(const struct llama_vocab * vocab, llama_token token); @ctypes_function( - "llama_token_get_attr", [llama_model_p_ctypes, llama_token], ctypes.c_int + "llama_vocab_get_attr", [llama_vocab_p_ctypes, llama_token], ctypes.c_int ) -def llama_token_get_attr( - model: llama_model_p, token: Union[llama_token, int], / +def llama_vocab_get_attr( + vocab: llama_vocab_p, token: Union[llama_token, int], / ) -> int: ... # // Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.) 
-# LLAMA_API bool llama_token_is_eog(const struct llama_model * model, llama_token token); +# LLAMA_API bool llama_vocab_is_eog(const struct llama_vocab * vocab, llama_token token); @ctypes_function( - "llama_token_is_eog", [llama_model_p_ctypes, llama_token], ctypes.c_bool + "llama_vocab_is_eog", [llama_vocab_p_ctypes, llama_token], ctypes.c_bool ) -def llama_token_is_eog(model: llama_model_p, token: Union[llama_token, int], /) -> bool: +def llama_vocab_is_eog(vocab: llama_vocab_p, token: Union[llama_token, int], /) -> bool: """Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.)""" ... # // Identify if Token Id is a control token or a render-able token -# LLAMA_API bool llama_token_is_control(const struct llama_model * model, llama_token token); +# LLAMA_API bool llama_vocab_is_control(const struct llama_vocab * vocab, llama_token token); @ctypes_function( - "llama_token_is_control", [llama_model_p_ctypes, llama_token], ctypes.c_bool + "llama_vocab_is_control", [llama_vocab_p_ctypes, llama_token], ctypes.c_bool ) -def llama_token_is_control( - model: llama_model_p, token: Union[llama_token, int], / +def llama_vocab_is_control( + vocab: llama_vocab_p, token: Union[llama_token, int], / ) -> bool: """Identify if Token Id is a control token or a render-able token""" ... # // Special tokens - - -# LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence -@ctypes_function("llama_token_bos", [llama_model_p_ctypes], llama_token) -def llama_token_bos(model: llama_model_p, /) -> int: +# LLAMA_API llama_token llama_vocab_bos(const struct llama_vocab * vocab); // beginning-of-sentence +@ctypes_function("llama_vocab_bos", [llama_vocab_p_ctypes], llama_token) +def llama_vocab_bos(vocab: llama_vocab_p, /) -> llama_token: """beginning-of-sentence""" ... -# LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence -@ctypes_function("llama_token_eos", [llama_model_p_ctypes], llama_token) -def llama_token_eos(model: llama_model_p, /) -> int: +# LLAMA_API llama_token llama_vocab_eos(const struct llama_vocab * vocab); // end-of-sentence +@ctypes_function("llama_vocab_eos", [llama_vocab_p_ctypes], llama_token) +def llama_vocab_eos(vocab: llama_vocab_p, /) -> llama_token: """end-of-sentence""" ... -# LLAMA_API llama_token llama_token_eot(const struct llama_model * model); // end-of-turn -@ctypes_function("llama_token_eot", [llama_model_p_ctypes], llama_token) -def llama_token_eot(model: llama_model_p, /) -> int: +# LLAMA_API llama_token llama_vocab_eot(const struct llama_vocab * vocab); // end-of-turn +@ctypes_function("llama_vocab_eot", [llama_vocab_p_ctypes], llama_token) +def llama_vocab_eot(vocab: llama_vocab_p, /) -> llama_token: """end-of-turn""" ... -# LLAMA_API llama_token llama_token_cls(const struct llama_model * model); // classification -@ctypes_function("llama_token_cls", [llama_model_p_ctypes], llama_token) -def llama_token_cls(model: llama_model_p, /) -> int: - """classification""" +# LLAMA_API llama_token llama_vocab_sep(const struct llama_vocab * vocab); // sentence separator +@ctypes_function("llama_vocab_sep", [llama_vocab_p_ctypes], llama_token) +def llama_vocab_sep(vocab: llama_vocab_p, /) -> llama_token: + """sentence separator""" ... 
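+
+# A short sketch of the vocab-based token queries above. `model` and `new_token`
+# are assumed to exist, as is the `llama_model_get_vocab` binding defined
+# earlier in this module:
+#
+#   vocab = llama_model_get_vocab(model)
+#   eos = llama_vocab_eos(vocab)
+#   print(llama_vocab_get_text(vocab, eos))
+#   # stop generation on any end-of-generation token, not just EOS
+#   if llama_vocab_is_eog(vocab, new_token):
+#       ...  # stop the generation loop
+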
-# LLAMA_API llama_token llama_token_sep(const struct llama_model * model); // sentence separator -@ctypes_function("llama_token_sep", [llama_model_p_ctypes], llama_token) -def llama_token_sep(model: llama_model_p, /) -> int: - """sentence separator""" +# LLAMA_API llama_token llama_vocab_nl (const struct llama_vocab * vocab); // next-line +@ctypes_function("llama_vocab_nl", [llama_vocab_p_ctypes], llama_token) +def llama_vocab_nl(vocab: llama_vocab_p, /) -> llama_token: + """next-line""" ... -# LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line -@ctypes_function("llama_token_nl", [llama_model_p_ctypes], llama_token) -def llama_token_nl(model: llama_model_p, /) -> int: - """next-line""" +# LLAMA_API llama_token llama_vocab_pad(const struct llama_vocab * vocab); // padding +@ctypes_function("llama_vocab_pad", [llama_vocab_p_ctypes], llama_token) +def llama_vocab_pad(vocab: llama_vocab_p, /) -> llama_token: + """padding""" ... -# LLAMA_API bool llama_add_bos_token(const struct llama_model * model); -@ctypes_function("llama_add_bos_token", [llama_model_p_ctypes], ctypes.c_bool) -def llama_add_bos_token(model: llama_model_p, /) -> bool: +# LLAMA_API bool llama_vocab_get_add_bos(const struct llama_vocab * vocab); +@ctypes_function( + "llama_vocab_get_add_bos", + [llama_vocab_p_ctypes], + ctypes.c_bool, +) +def llama_vocab_get_add_bos(vocab: llama_vocab_p, /) -> bool: ... -# LLAMA_API bool llama_add_eos_token(const struct llama_model * model); -@ctypes_function("llama_add_eos_token", [llama_model_p_ctypes], ctypes.c_bool) -def llama_add_eos_token(model: llama_model_p, /) -> bool: +# LLAMA_API bool llama_vocab_get_add_eos(const struct llama_vocab * vocab); +@ctypes_function( + "llama_vocab_get_add_eos", + [llama_vocab_p_ctypes], + ctypes.c_bool, +) +def llama_vocab_get_add_eos(vocab: llama_vocab_p, /) -> bool: ... -# // Codellama infill tokens -# DEPRECATED(LLAMA_API llama_token llama_token_prefix(const struct llama_model * model), "use llama_token_fim_pre instead"); -@ctypes_function("llama_token_prefix", [llama_model_p_ctypes], llama_token) -def llama_token_prefix(model: llama_model_p) -> int: - """codellama infill tokens""" +# LLAMA_API bool llama_vocab_get_add_sep(const struct llama_vocab * vocab); +@ctypes_function( + "llama_vocab_get_add_sep", + [llama_vocab_p_ctypes], + ctypes.c_bool, +) +def llama_vocab_get_add_sep(vocab: llama_vocab_p, /) -> bool: ... -# DEPRECATED(LLAMA_API llama_token llama_token_middle(const struct llama_model * model), "use llama_token_fim_mid instead"); -@ctypes_function("llama_token_middle", [llama_model_p_ctypes], llama_token) -def llama_token_middle(model: llama_model_p, /) -> int: +# LLAMA_API llama_token llama_vocab_fim_pre(const struct llama_vocab * vocab); +@ctypes_function( + "llama_vocab_fim_pre", + [llama_vocab_p_ctypes], + llama_token, +) +def llama_vocab_fim_pre(vocab: llama_vocab_p, /) -> llama_token: ... -# DEPRECATED(LLAMA_API llama_token llama_token_suffix(const struct llama_model * model), "use llama_token_fim_suf instead"); -@ctypes_function("llama_token_suffix", [llama_model_p_ctypes], llama_token) -def llama_token_suffix(model: llama_model_p, /) -> int: +# LLAMA_API llama_token llama_vocab_fim_suf(const struct llama_vocab * vocab); +@ctypes_function( + "llama_vocab_fim_suf", + [llama_vocab_p_ctypes], + llama_token, +) +def llama_vocab_fim_suf(vocab: llama_vocab_p, /) -> llama_token: ... 
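+
+# Sketch: when assembling token lists by hand, honour the model's BOS/EOS
+# preferences via the flags above (a `vocab` and a `tokens` list are assumed):
+#
+#   if llama_vocab_get_add_bos(vocab):
+#       tokens.insert(0, llama_vocab_bos(vocab))
+#   if llama_vocab_get_add_eos(vocab):
+#       tokens.append(llama_vocab_eos(vocab))
+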
-# LLAMA_API llama_token llama_token_fim_pre(const struct llama_model * model); -@ctypes_function("llama_token_fim_pre", [llama_model_p_ctypes], llama_token) -def llama_token_fim_pre(model: llama_model_p, /) -> int: +# LLAMA_API llama_token llama_vocab_fim_mid(const struct llama_vocab * vocab); +@ctypes_function( + "llama_vocab_fim_mid", + [llama_vocab_p_ctypes], + llama_token, +) +def llama_vocab_fim_mid(vocab: llama_vocab_p, /) -> llama_token: ... -# LLAMA_API llama_token llama_token_fim_suf(const struct llama_model * model); -@ctypes_function("llama_token_fim_suf", [llama_model_p_ctypes], llama_token) -def llama_token_fim_suf(model: llama_model_p, /) -> int: + +# LLAMA_API llama_token llama_vocab_fim_pad(const struct llama_vocab * vocab); +@ctypes_function( + "llama_vocab_fim_pad", + [llama_vocab_p_ctypes], + llama_token, +) +def llama_vocab_fim_pad(vocab: llama_vocab_p, /) -> llama_token: ... -# LLAMA_API llama_token llama_token_fim_mid(const struct llama_model * model); -@ctypes_function("llama_token_fim_mid", [llama_model_p_ctypes], llama_token) -def llama_token_fim_mid(model: llama_model_p, /) -> int: + +# LLAMA_API llama_token llama_vocab_fim_rep(const struct llama_vocab * vocab); +@ctypes_function( + "llama_vocab_fim_rep", + [llama_vocab_p_ctypes], + llama_token, +) +def llama_vocab_fim_rep(vocab: llama_vocab_p, /) -> llama_token: ... -# LLAMA_API llama_token llama_token_fim_pad(const struct llama_model * model); -@ctypes_function("llama_token_fim_pad", [llama_model_p_ctypes], llama_token) -def llama_token_fim_pad(model: llama_model_p, /) -> int: + +# LLAMA_API llama_token llama_vocab_fim_sep(const struct llama_vocab * vocab); +@ctypes_function( + "llama_vocab_fim_sep", + [llama_vocab_p_ctypes], + llama_token, +) +def llama_vocab_fim_sep(vocab: llama_vocab_p, /) -> llama_token: ... -# LLAMA_API llama_token llama_token_fim_rep(const struct llama_model * model); -@ctypes_function("llama_token_fim_rep", [llama_model_p_ctypes], llama_token) -def llama_token_fim_rep(model: llama_model_p, /) -> int: + +# DEPRECATED functions +# DEPRECATED(LLAMA_API const char * llama_token_get_text(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_get_text instead"); +@ctypes_function( + "llama_token_get_text", + [llama_vocab_p_ctypes, llama_token], + ctypes.c_char_p, +) +def llama_token_get_text( + vocab: llama_vocab_p, token: Union[llama_token, int], / +) -> bytes: ... -# LLAMA_API llama_token llama_token_fim_sep(const struct llama_model * model); -@ctypes_function("llama_token_fim_sep", [llama_model_p_ctypes], llama_token) -def llama_token_fim_sep(model: llama_model_p, /) -> int: + +# DEPRECATED(LLAMA_API float llama_token_get_score(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_get_score instead"); +@ctypes_function( + "llama_token_get_score", + [llama_vocab_p_ctypes, llama_token], + ctypes.c_float, +) +def llama_token_get_score( + vocab: llama_vocab_p, token: Union[llama_token, int], / +) -> float: ... +# DEPRECATED(LLAMA_API enum llama_token_attr llama_token_get_attr(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_get_attr instead"); +@ctypes_function( + "llama_token_get_attr", + [llama_vocab_p_ctypes, llama_token], + ctypes.c_int, +) +def llama_token_get_attr( + vocab: llama_vocab_p, token: Union[llama_token, int], / +) -> int: + ... 
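+
+# Sketch: assembling a fill-in-the-middle prompt from the FIM tokens above,
+# assuming `vocab` plus pre-tokenized `prefix_tokens` and `suffix_tokens`;
+# models without FIM support return LLAMA_TOKEN_NULL (-1) for these ids:
+#
+#   pre = llama_vocab_fim_pre(vocab)
+#   suf = llama_vocab_fim_suf(vocab)
+#   mid = llama_vocab_fim_mid(vocab)
+#   if -1 not in (pre, suf, mid):
+#       fim_prompt = [pre, *prefix_tokens, suf, *suffix_tokens, mid]
+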
+ +# DEPRECATED(LLAMA_API bool llama_token_is_eog(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_is_eog instead"); +@ctypes_function( + "llama_token_is_eog", + [llama_vocab_p_ctypes, llama_token], + ctypes.c_bool, +) +def llama_token_is_eog( + vocab: llama_vocab_p, token: Union[llama_token, int], / +) -> bool: + ... + +# DEPRECATED(LLAMA_API bool llama_token_is_control(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_is_control instead"); +@ctypes_function( + "llama_token_is_control", + [llama_vocab_p_ctypes, llama_token], + ctypes.c_bool, +) +def llama_token_is_control( + vocab: llama_vocab_p, token: Union[llama_token, int], / +) -> bool: + ... + +# DEPRECATED(LLAMA_API llama_token llama_token_bos(const struct llama_vocab * vocab), "use llama_vocab_bos instead"); +@ctypes_function( + "llama_token_bos", + [llama_vocab_p_ctypes], + llama_token, +) +def llama_token_bos(vocab: llama_vocab_p, /) -> int: + ... + +# DEPRECATED(LLAMA_API llama_token llama_token_eos(const struct llama_vocab * vocab), "use llama_vocab_eos instead"); +@ctypes_function( + "llama_token_eos", + [llama_vocab_p_ctypes], + llama_token, +) +def llama_token_eos(vocab: llama_vocab_p, /) -> int: + ... + +# DEPRECATED(LLAMA_API llama_token llama_token_eot(const struct llama_vocab * vocab), "use llama_vocab_eot instead"); +@ctypes_function( + "llama_token_eot", + [llama_vocab_p_ctypes], + llama_token, +) +def llama_token_eot(vocab: llama_vocab_p, /) -> int: + ... + +# DEPRECATED(LLAMA_API llama_token llama_token_cls(const struct llama_vocab * vocab), "use llama_vocab_cls instead"); +@ctypes_function( + "llama_token_cls", + [llama_vocab_p_ctypes], + llama_token, +) +def llama_token_cls(vocab: llama_vocab_p, /) -> int: + ... + +# DEPRECATED(LLAMA_API llama_token llama_token_sep(const struct llama_vocab * vocab), "use llama_vocab_sep instead"); +@ctypes_function( + "llama_token_sep", + [llama_vocab_p_ctypes], + llama_token, +) +def llama_token_sep(vocab: llama_vocab_p, /) -> int: + ... + + +# DEPRECATED(LLAMA_API llama_token llama_token_nl (const struct llama_vocab * vocab), "use llama_vocab_nl instead"); +@ctypes_function( + "llama_token_nl", + [llama_vocab_p_ctypes], + llama_token, +) +def llama_token_nl(vocab: llama_vocab_p, /) -> int: + ... + + +# DEPRECATED(LLAMA_API llama_token llama_token_pad(const struct llama_vocab * vocab), "use llama_vocab_pad instead"); +@ctypes_function( + "llama_token_pad", + [llama_vocab_p_ctypes], + llama_token, +) +def llama_token_pad(vocab: llama_vocab_p, /) -> int: + ... + + +# DEPRECATED(LLAMA_API bool llama_add_bos_token(const struct llama_vocab * vocab), "use llama_vocab_get_add_bos instead"); +@ctypes_function( + "llama_add_bos_token", + [llama_vocab_p_ctypes], + ctypes.c_bool, +) +def llama_add_bos_token(vocab: llama_vocab_p, /) -> bool: + ... + +# DEPRECATED(LLAMA_API bool llama_add_eos_token(const struct llama_vocab * vocab), "use llama_vocab_get_add_eos instead"); +@ctypes_function( + "llama_add_eos_token", + [llama_vocab_p_ctypes], + ctypes.c_bool, +) +def llama_add_eos_token(vocab: llama_vocab_p, /) -> bool: + ... + + +# DEPRECATED(LLAMA_API llama_token llama_token_fim_pre(const struct llama_vocab * vocab), "use llama_vocab_fim_pre instead"); +@ctypes_function( + "llama_token_fim_pre", + [llama_vocab_p_ctypes], + llama_token, +) +def llama_token_fim_pre(vocab: llama_vocab_p, /) -> llama_token: + ... 
+ +# DEPRECATED(LLAMA_API llama_token llama_token_fim_suf(const struct llama_vocab * vocab), "use llama_vocab_fim_suf instead"); +@ctypes_function( + "llama_token_fim_suf", + [llama_vocab_p_ctypes], + llama_token, +) +def llama_token_fim_suf(vocab: llama_vocab_p, /) -> llama_token: + ... + +# DEPRECATED(LLAMA_API llama_token llama_token_fim_mid(const struct llama_vocab * vocab), "use llama_vocab_fim_mid instead"); +@ctypes_function( + "llama_token_fim_mid", + [llama_vocab_p_ctypes], + llama_token, +) +def llama_token_fim_mid(vocab: llama_vocab_p, /) -> llama_token: + ... + +# DEPRECATED(LLAMA_API llama_token llama_token_fim_pad(const struct llama_vocab * vocab), "use llama_vocab_fim_pad instead"); +@ctypes_function( + "llama_token_fim_pad", + [llama_vocab_p_ctypes], + llama_token, +) +def llama_token_fim_pad(vocab: llama_vocab_p, /) -> llama_token: + ... + +# DEPRECATED(LLAMA_API llama_token llama_token_fim_rep(const struct llama_vocab * vocab), "use llama_vocab_fim_rep instead"); +@ctypes_function( + "llama_token_fim_rep", + [llama_vocab_p_ctypes], + llama_token, +) +def llama_token_fim_rep(vocab: llama_vocab_p, /) -> llama_token: + ... + +# DEPRECATED(LLAMA_API llama_token llama_token_fim_sep(const struct llama_vocab * vocab), "use llama_vocab_fim_sep instead"); +@ctypes_function( + "llama_token_fim_sep", + [llama_vocab_p_ctypes], + llama_token, +) +def llama_token_fim_sep(vocab: llama_vocab_p, /) -> llama_token: + ... + +# // CLS is equivalent to BOS +# DEPRECATED(LLAMA_API llama_token llama_vocab_cls(const struct llama_vocab * vocab), // classification +# "use llama_vocab_bos instead"); +@ctypes_function( + "llama_vocab_cls", + [llama_vocab_p_ctypes], + llama_token, +) +def llama_vocab_cls(vocab: llama_vocab_p, /) -> llama_token: + ... + + # // # // Tokenization # // # // The API is thread-safe. # // - # /// @details Convert the provided text into tokens. # /// @param tokens The tokens pointer must be large enough to hold the resulting tokens. # /// @return Returns the number of tokens on success, no more than n_tokens_max # /// @return Returns a negative number on failure - the number of tokens that would have been returned +# /// @return Returns INT32_MIN on overflow (e.g., tokenization result size exceeds int32_t limit) # /// @param add_special Allow to add BOS and EOS tokens if model is configured to do so. # /// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated # /// as plaintext. Does not insert a leading space. # LLAMA_API int32_t llama_tokenize( -# const struct llama_model * model, +# const struct llama_vocab * vocab, # const char * text, # int32_t text_len, # llama_token * tokens, @@ -2761,7 +3339,7 @@ def llama_token_fim_sep(model: llama_model_p, /) -> int: @ctypes_function( "llama_tokenize", [ - llama_model_p_ctypes, + llama_vocab_p_ctypes, ctypes.c_char_p, ctypes.c_int32, llama_token_p, @@ -2772,7 +3350,7 @@ def llama_token_fim_sep(model: llama_model_p, /) -> int: ctypes.c_int32, ) def llama_tokenize( - model: llama_model_p, + vocab: llama_vocab_p, text: bytes, text_len: Union[ctypes.c_int, int], tokens: CtypesArray[llama_token], @@ -2784,12 +3362,12 @@ def llama_tokenize( """Convert the provided text into tokens. Args: - model: The model to use for tokenization. + vocab: The vocabulary to use for tokenization. text: The text to tokenize. text_len: The length of the text. tokens: The tokens pointer must be large enough to hold the resulting tokens. n_max_tokens: The maximum number of tokens to return. 
- add_special: Allow adding special tokenns if the model is configured to do so. + add_special: Allow adding special tokens if the model is configured to do so. parse_special: Allow parsing special tokens. Returns: @@ -2805,7 +3383,7 @@ def llama_tokenize( # // User can skip up to 'lstrip' leading spaces before copying (useful when encoding/decoding multiple tokens with 'add_space_prefix') # // @param special If true, special tokens are rendered in the output. # LLAMA_API int32_t llama_token_to_piece( -# const struct llama_model * model, +# const struct llama_vocab * vocab, # llama_token token, # char * buf, # int32_t length, @@ -2814,7 +3392,7 @@ def llama_tokenize( @ctypes_function( "llama_token_to_piece", [ - llama_model_p_ctypes, + llama_vocab_p_ctypes, llama_token, ctypes.c_char_p, ctypes.c_int32, @@ -2824,7 +3402,7 @@ def llama_tokenize( ctypes.c_int32, ) def llama_token_to_piece( - model: llama_model_p, + vocab: llama_vocab_p, token: Union[llama_token, int], buf: Union[ctypes.c_char_p, bytes, CtypesArray[ctypes.c_char]], length: Union[ctypes.c_int, int], @@ -2838,7 +3416,7 @@ def llama_token_to_piece( User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens. Args: - model: The model to use for tokenization. + vocab: The vocabulary to use for tokenization. token: The token to convert. buf: The buffer to write the token to. length: The length of the buffer. @@ -2847,23 +3425,6 @@ def llama_token_to_piece( ... -# # // check if token0 is contained as a prefix in token1 -# # LLAMA_API bool llama_token_is_prefix( -# # const struct llama_model * model, -# # llama_token token0, -# # llama_token token1); -# @ctypes_function( -# "llama_token_is_prefix", -# [llama_model_p_ctypes, llama_token, llama_token], -# ctypes.c_bool, -# ) -# def llama_token_is_prefix( -# model: llama_model_p, token0: Union[llama_token, int], token1: Union[llama_token, int], / -# ) -> bool: -# """Check if token0 is contained as a prefix in token1""" -# ... - - # /// @details Convert the provided tokens into text (inverse of llama_tokenize()). # /// @param text The char pointer must be large enough to hold the resulting text. # /// @return Returns the number of chars/bytes on success, no more than text_len_max. @@ -2871,7 +3432,7 @@ def llama_token_to_piece( # /// @param remove_special Allow to remove BOS and EOS tokens if model is configured to do so. # /// @param unparse_special If true, special tokens are rendered in the output. # LLAMA_API int32_t llama_detokenize( -# const struct llama_model * model, +# const struct llama_vocab * vocab, # const llama_token * tokens, # int32_t n_tokens, # char * text, @@ -2881,7 +3442,7 @@ def llama_token_to_piece( @ctypes_function( "llama_detokenize", [ - llama_model_p_ctypes, + llama_vocab_p_ctypes, ctypes.POINTER(llama_token), ctypes.c_int32, ctypes.c_char_p, @@ -2892,7 +3453,7 @@ def llama_token_to_piece( ctypes.c_int32, ) def llama_detokenize( - model: llama_model_p, + vocab: llama_vocab_p, tokens: CtypesArray[llama_token], n_tokens: Union[ctypes.c_int, int], text: bytes, @@ -2904,7 +3465,7 @@ def llama_detokenize( """Convert the provided tokens into text (inverse of llama_tokenize()). Args: - model: The model to use for tokenization. + vocab: The vocabulary to use for tokenization. tokens: The tokens to convert. n_tokens: The number of tokens. text: The buffer to write the text to. @@ -2918,11 +3479,10 @@ def llama_detokenize( # // Chat templates # // - # /// Apply chat template. 
Inspired by hf apply_chat_template() on python. # /// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model" -# /// NOTE: This function does not use a jinja parser. It only support a pre-defined list of template. See more: https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template -# /// @param tmpl A Jinja template to use for this chat. If this is nullptr, the model’s default chat template will be used instead. +# /// NOTE: This function does not use a jinja parser. It only support a pre-defined list of template. See more: https://github.com/ggml-org/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template +# /// @param tmpl A Jinja template to use for this chat. If this is nullptr, the model's default chat template will be used instead. # /// @param chat Pointer to a list of multiple llama_chat_message # /// @param n_msg Number of llama_chat_message in this chat # /// @param add_ass Whether to end the prompt with the token(s) that indicate the start of an assistant message. @@ -2930,7 +3490,6 @@ def llama_detokenize( # /// @param length The size of the allocated buffer # /// @return The total number of bytes of the formatted prompt. If is it larger than the size of buffer, you may need to re-alloc it and then re-apply the template. # LLAMA_API int32_t llama_chat_apply_template( -# const struct llama_model * model, # const char * tmpl, # const struct llama_chat_message * chat, # size_t n_msg, @@ -2940,20 +3499,37 @@ def llama_detokenize( @ctypes_function( "llama_chat_apply_template", [ - ctypes.c_void_p, - ctypes.c_char_p, - ctypes.POINTER(llama_chat_message), - ctypes.c_size_t, + ctypes.c_char_p, # tmpl + ctypes.POINTER(llama_chat_message), # chat + ctypes.c_size_t, # n_msg + ctypes.c_bool, # add_ass (added) + ctypes.c_char_p, # buf + ctypes.c_int32, # length ], ctypes.c_int32, ) def llama_chat_apply_template( - model: llama_model_p, tmpl: bytes, chat: CtypesArray[llama_chat_message], n_msg: int, + add_ass: bool, # Added parameter + buf: bytes, + length: int, /, ) -> int: + """Apply chat template. + + Args: + tmpl: Template to use. If None, uses model's default + chat: Array of chat messages + n_msg: Number of messages + add_ass: Whether to end prompt with assistant token + buf: Output buffer + length: Buffer length + + Returns: + Number of bytes written, or needed if buffer too small + """ ... @@ -2988,42 +3564,6 @@ def llama_chat_builtin_templates( # // # // Sampling API # // -# // Sample usage: -# // -# // // prepare the sampling chain at the start -# // auto sparams = llama_sampler_chain_default_params(); -# // -# // llama_sampler * smpl = llama_sampler_chain_init(sparams); -# // -# // llama_sampler_chain_add(smpl, llama_sampler_init_top_k(50)); -# // llama_sampler_chain_add(smpl, llama_sampler_init_top_p(0.9, 1)); -# // llama_sampler_chain_add(smpl, llama_sampler_init_temp (0.8)); -# // -# // // typically, the chain should end with a sampler such as "greedy", "dist" or "mirostat" -# // // this sampler will be responsible to select the actual token -# // llama_sampler_chain_add(smpl, llama_sampler_init_dist(seed)); -# // -# // ... -# // -# // // decoding loop: -# // while (...) { -# // ... -# // -# // llama_decode(ctx, batch); -# // -# // // sample from the logits of the last token in the batch -# // const llama_token id = llama_sampler_sample(smpl, ctx, -1); -# // -# // // accepting the token updates the internal state of certain samplers (e.g. 
grammar, repetition, etc.) -# // llama_sampler_accept(smpl, id); -# // ... -# // } -# // -# // llama_sampler_free(smpl); -# // -# // TODO: In the future, llama_sampler will be utilized to offload the sampling to the backends (e.g. GPU). -# // TODO: in the future, the entire sampling API that uses llama_model should start using llama_vocab -# // # typedef void * llama_sampler_context_t; llama_sampler_context_t = ctypes.c_void_p @@ -3037,7 +3577,7 @@ def llama_chat_builtin_templates( # void (*reset) ( struct llama_sampler * smpl); // can be NULL # struct llama_sampler * (*clone) (const struct llama_sampler * smpl); // can be NULL if ctx is NULL # void (*free) ( struct llama_sampler * smpl); // can be NULL if ctx is NULL -# + # // TODO: API for internal libllama usage for appending the sampling to an existing ggml_cgraph # //void (*apply_ggml) (struct llama_sampler * smpl, ...); # }; @@ -3046,8 +3586,8 @@ class llama_sampler_i(ctypes.Structure): # struct llama_sampler { -# struct llama_sampler_i * iface; -# llama_sampler_context_t ctx; +# const struct llama_sampler_i * iface; +# llama_sampler_context_t ctx; # }; class llama_sampler(ctypes.Structure): _fields_ = [ @@ -3081,6 +3621,18 @@ class llama_sampler(ctypes.Structure): # // mirror of llama_sampler_i: +# LLAMA_API struct llama_sampler * llama_sampler_init (const struct llama_sampler_i * iface, llama_sampler_context_t ctx); +@ctypes_function( + "llama_sampler_init", + [ctypes.POINTER(llama_sampler_i), llama_sampler_context_t], + llama_sampler_p_ctypes, +) +def llama_sampler_init( + iface: ctypes.POINTER(llama_sampler_i), ctx: llama_sampler_context_t, / +) -> llama_sampler_p: + ... + + # LLAMA_API const char * llama_sampler_name (const struct llama_sampler * smpl); @ctypes_function( "llama_sampler_name", @@ -3146,7 +3698,7 @@ def llama_sampler_free(smpl: llama_sampler_p, /): # // llama_sampler_chain # // a type of llama_sampler that can chain multiple samplers one after another -# + # LLAMA_API struct llama_sampler * llama_sampler_chain_init(struct llama_sampler_chain_params params); @ctypes_function( "llama_sampler_chain_init", @@ -3204,7 +3756,7 @@ def llama_sampler_chain_remove( # // available samplers: -# + # LLAMA_API struct llama_sampler * llama_sampler_init_greedy(void); @ctypes_function("llama_sampler_init_greedy", [], llama_sampler_p_ctypes) def llama_sampler_init_greedy() -> llama_sampler_p: @@ -3220,13 +3772,14 @@ def llama_sampler_init_dist(seed: int) -> llama_sampler_p: # /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. # /// NOTE: Avoid using on the full vocabulary as the sorting can become slow. For example, apply top-k or top-p sampling first. # DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_softmax (void), -# "will be removed in the future (see https://github.com/ggerganov/llama.cpp/pull/9896#discussion_r1800920915)"); +# "will be removed in the future (see https://github.com/ggml-org/llama.cpp/pull/9896#discussion_r1800920915)"); @ctypes_function("llama_sampler_init_softmax", [], llama_sampler_p_ctypes) def llama_sampler_init_softmax() -> llama_sampler_p: ... 
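+
+# A Python rendering of the sampler-chain usage sketch from the upstream header,
+# assuming an existing `ctx` that has already decoded a batch and an integer
+# `seed`:
+#
+#   sparams = llama_sampler_chain_default_params()
+#   chain = llama_sampler_chain_init(sparams)
+#   llama_sampler_chain_add(chain, llama_sampler_init_top_k(50))
+#   # the chain should end with a selecting sampler such as "dist" or "greedy"
+#   llama_sampler_chain_add(chain, llama_sampler_init_dist(seed))
+#   token = llama_sampler_sample(chain, ctx, -1)  # sample from the last logits
+#   llama_sampler_free(chain)
+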
# /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 +# /// Setting k <= 0 makes this a noop # LLAMA_API struct llama_sampler * llama_sampler_init_top_k (int32_t k); @ctypes_function("llama_sampler_init_top_k", [ctypes.c_int32], llama_sampler_p_ctypes) def llama_sampler_init_top_k(k: int) -> llama_sampler_p: @@ -3244,7 +3797,7 @@ def llama_sampler_init_top_p(p: float, min_keep: int) -> llama_sampler_p: ... -# /// @details Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841 +# /// @details Minimum P sampling as described in https://github.com/ggml-org/llama.cpp/pull/3841 # LLAMA_API struct llama_sampler * llama_sampler_init_min_p (float p, size_t min_keep); @ctypes_function( "llama_sampler_init_min_p", @@ -3266,6 +3819,7 @@ def llama_sampler_init_typical(p: float, min_keep: int) -> llama_sampler_p: ... +# /// #details Updates the logits l_i` = l_i/t. When t <= 0.0f, the maximum logit is kept at it's original value, the rest are set to -inf # LLAMA_API struct llama_sampler * llama_sampler_init_temp (float t); @ctypes_function("llama_sampler_init_temp", [ctypes.c_float], llama_sampler_p_ctypes) def llama_sampler_init_temp(t: float) -> llama_sampler_p: @@ -3298,12 +3852,18 @@ def llama_sampler_init_xtc( ... +# /// @details Top n sigma sampling as described in academic paper "Top-nσ: Not All Logits Are You Need" https://arxiv.org/pdf/2411.07641 +# LLAMA_API struct llama_sampler * llama_sampler_init_top_n_sigma(float n); +@ctypes_function( + "llama_sampler_init_top_n_sigma", + [ctypes.c_float], + llama_sampler_p_ctypes, +) +def llama_sampler_init_top_n_sigma(n: float, /) -> llama_sampler_p: + ... + + # /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. -# /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. -# /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. -# /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. -# /// @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm. -# /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. # LLAMA_API struct llama_sampler * llama_sampler_init_mirostat( # int32_t n_vocab, # uint32_t seed, @@ -3322,10 +3882,6 @@ def llama_sampler_init_mirostat( # /// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. 
-# /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. -# /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. -# /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. -# /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. # LLAMA_API struct llama_sampler * llama_sampler_init_mirostat_v2( # uint32_t seed, # float tau, @@ -3341,17 +3897,88 @@ def llama_sampler_init_mirostat_v2( ... +# /// @details Intializes a GBNF grammar, see grammars/README.md for details. # LLAMA_API struct llama_sampler * llama_sampler_init_grammar( -# const struct llama_model * model, +# const struct llama_vocab * vocab, # const char * grammar_str, # const char * grammar_root); @ctypes_function( "llama_sampler_init_grammar", - [llama_model_p_ctypes, ctypes.c_char_p, ctypes.c_char_p], + [llama_vocab_p_ctypes, ctypes.c_char_p, ctypes.c_char_p], llama_sampler_p_ctypes, ) def llama_sampler_init_grammar( - model: llama_model_p, grammar_str: bytes, grammar_root: bytes, / + vocab: llama_vocab_p, grammar_str: bytes, grammar_root: bytes, / +) -> llama_sampler_p: + ... + + +# DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_grammar_lazy( +# const struct llama_vocab * vocab, +# const char * grammar_str, +# const char * grammar_root, +# const char ** trigger_words, +# size_t num_trigger_words, +# const llama_token * trigger_tokens, +# size_t num_trigger_tokens), +# "use llama_sampler_init_grammar_lazy_patterns instead"); +@ctypes_function( + "llama_sampler_init_grammar_lazy", + [ + llama_vocab_p_ctypes, + ctypes.c_char_p, + ctypes.c_char_p, + ctypes.POINTER(ctypes.c_char_p), + ctypes.c_size_t, + ctypes.POINTER(llama_token), + ctypes.c_size_t, + ], + llama_sampler_p_ctypes, +) +def llama_sampler_init_grammar_lazy( + vocab: llama_vocab_p, + grammar_str: bytes, + grammar_root: bytes, + trigger_words: CtypesArray[bytes], + num_trigger_words: int, + trigger_tokens: CtypesArray[llama_token], + num_trigger_tokens: int, + /, +) -> llama_sampler_p: + ... 
+ + +# /// @details Lazy grammar sampler, introduced in https://github.com/ggml-org/llama.cpp/pull/9639 +# LLAMA_API struct llama_sampler * llama_sampler_init_grammar_lazy_patterns( +# const struct llama_vocab * vocab, +# const char * grammar_str, +# const char * grammar_root, +# const char ** trigger_patterns, +# size_t num_trigger_patterns, +# const llama_token * trigger_tokens, +# size_t num_trigger_tokens); +@ctypes_function( + "llama_sampler_init_grammar_lazy_patterns", + [ + llama_vocab_p_ctypes, + ctypes.c_char_p, + ctypes.c_char_p, + ctypes.POINTER(ctypes.c_char_p), + ctypes.c_size_t, + ctypes.POINTER(llama_token), + ctypes.c_size_t, + ], + llama_sampler_p_ctypes, +) +def llama_sampler_init_grammar_lazy_patterns( + vocab: llama_vocab_p, + grammar_str: bytes, + grammar_root: bytes, + trigger_patterns: CtypesArray[bytes], + num_trigger_patterns: int, + trigger_tokens: CtypesArray[llama_token], + num_trigger_tokens: int, + /, ) -> llama_sampler_p: ... @@ -3379,7 +4006,8 @@ def llama_sampler_init_penalties( # /// @details DRY sampler, designed by p-e-w, as described in: https://github.com/oobabooga/text-generation-webui/pull/5677, porting Koboldcpp implementation authored by pi6am: https://github.com/LostRuins/koboldcpp/pull/982 # LLAMA_API struct llama_sampler * llama_sampler_init_dry( -# const struct llama_model * model, +# const struct llama_vocab * vocab, +# int32_t n_ctx_train, # float dry_multiplier, # float dry_base, # int32_t dry_allowed_length, @@ -3389,7 +4017,8 @@ def llama_sampler_init_penalties( @ctypes_function( "llama_sampler_init_dry", [ - llama_model_p_ctypes, + llama_vocab_p_ctypes, + ctypes.c_int32, ctypes.c_float, ctypes.c_float, ctypes.c_int32, @@ -3400,12 +4029,13 @@ def llama_sampler_init_penalties( llama_sampler_p_ctypes, ) def llama_sampler_init_dry( - model: llama_model_p, + vocab: llama_vocab_p, + n_ctx_train: int, dry_multiplier: float, dry_base: float, dry_allowed_length: int, dry_penalty_last_n: int, - seq_breakers: CtypesArray[bytes], + seq_breakers, num_breakers: int, /, ) -> llama_sampler_p: @@ -3428,35 +4058,13 @@ def llama_sampler_init_logit_bias( # // this sampler is meant to be used for fill-in-the-middle infilling -# // it's supposed to be used after top_k + top_p sampling -# // -# // 1. if the sum of the EOG probs times the number of candidates is higher than the sum of the other probs -> pick EOG -# // 2. combine probs of tokens that have the same prefix -# // -# // example: -# // -# // - before: -# // "hel": 0.5 -# // "hell": 0.2 -# // "hello": 0.1 -# // "dummy": 0.1 -# // -# // - after: -# // "hel": 0.8 -# // "dummy": 0.1 -# // -# // 3. discard non-EOG tokens with low prob -# // 4. if no tokens are left -> pick EOT -# // -# LLAMA_API struct llama_sampler * llama_sampler_init_infill(const struct llama_model * model); +# LLAMA_API struct llama_sampler * llama_sampler_init_infill(const struct llama_vocab * vocab); @ctypes_function( "llama_sampler_init_infill", - [llama_model_p_ctypes], + [llama_vocab_p_ctypes], llama_sampler_p_ctypes, ) -def llama_sampler_init_infill(model: llama_model_p, /) -> llama_sampler_p: - """This sampler is meant to be used for fill-in-the-middle infilling. - """ +def llama_sampler_init_infill(vocab: llama_vocab_p, /) -> llama_sampler_p: ... 
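+
+# Sketch: constrain sampling with a GBNF grammar via the sampler above, reusing
+# `vocab` and the sampler `chain` from the earlier sketches; the grammar string
+# is illustrative:
+#
+#   yes_no = b'root ::= "yes" | "no"'
+#   llama_sampler_chain_add(chain, llama_sampler_init_grammar(vocab, yes_no, b"root"))
+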
@@ -3472,15 +4080,6 @@ def llama_sampler_get_seed(smpl: llama_sampler_p, /) -> int: # /// @details Sample and accept a token from the idx-th output of the last evaluation -# // -# // Shorthand for: -# // const auto * logits = llama_get_logits_ith(ctx, idx); -# // llama_token_data_array cur_p = { ... init from logits ... }; -# // llama_sampler_apply(smpl, &cur_p); -# // auto token = cur_p.data[cur_p.selected].id; -# // llama_sampler_accept(smpl, token); -# // return token; -# // Returns the sampled token # LLAMA_API llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_context * ctx, int32_t idx); @ctypes_function( "llama_sampler_sample", @@ -3497,10 +4096,7 @@ def llama_sampler_sample( # // Model split # // - # /// @details Build a split GGUF final path for this chunk. -# /// llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf" -# // Returns the split_path length. # LLAMA_API int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count); @ctypes_function( "llama_split_path", @@ -3520,8 +4116,6 @@ def llama_split_path( # /// @details Extract the path prefix from the split_path if and only if the split_no and split_count match. -# /// llama_split_prefix(split_prefix, 64, "/models/ggml-model-q4_0-00002-of-00004.gguf", 2, 4) => split_prefix = "/models/ggml-model-q4_0" -# // Returns the split_prefix length. # LLAMA_API int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count); @ctypes_function( "llama_split_prefix", @@ -3569,16 +4163,13 @@ def llama_log_set( # // # // Performance utils # // -# // NOTE: Used by llama.cpp examples, avoid using in third-party apps. Instead, do your own performance measurements. -# // - # struct llama_perf_context_data { # double t_start_ms; # double t_load_ms; # double t_p_eval_ms; # double t_eval_ms; -# + # int32_t n_p_eval; # int32_t n_eval; # }; @@ -3595,7 +4186,7 @@ class llama_perf_context_data(ctypes.Structure): # struct llama_perf_sampler_data { # double t_sample_ms; -# + # int32_t n_sample; # }; class llama_perf_sampler_data(ctypes.Structure): @@ -3666,3 +4257,83 @@ def llama_perf_sampler_reset(chain: llama_sampler_p, /): ... +# // +# // training +# // + +# // function that returns whether or not a given tensor contains trainable parameters +# typedef bool (*llama_opt_param_filter)(const struct ggml_tensor * tensor, void * userdata); +llama_opt_param_filter = ctypes.CFUNCTYPE(ctypes.c_bool, ctypes.c_void_p, ctypes.c_void_p) + +# // always returns true +# LLAMA_API bool llama_opt_param_filter_all(const struct ggml_tensor * tensor, void * userdata); +@ctypes_function( + "llama_opt_param_filter_all", + [ctypes.c_void_p, ctypes.c_void_p], + ctypes.c_bool, +) +def llama_opt_param_filter_all(tensor: ctypes.c_void_p, userdata: ctypes.c_void_p, /) -> bool: + ... 
+ + +# struct llama_opt_params { +# uint32_t n_ctx_train; // assumed context size post training, use context size specified in llama_context if 0 + +# llama_opt_param_filter param_filter; // callback for determining which tensors contain trainable parameters +# void * param_filter_ud; // userdata for determining which tensors contain trainable parameters + +# ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters +# void * get_opt_pars_ud; // userdata for calculating optimizer parameters +# }; +class llama_opt_params(ctypes.Structure): + _fields_ = [ + ("n_ctx_train", ctypes.c_uint32), + ("param_filter", llama_opt_param_filter), + ("param_filter_ud", ctypes.c_void_p), + ("get_opt_pars", ctypes.c_void_p), # ggml_opt_get_optimizer_params - not implemented here + ("get_opt_pars_ud", ctypes.c_void_p), + ] + + +# LLAMA_API void llama_opt_init(struct llama_context * lctx, struct llama_model * model, struct llama_opt_params lopt_params); +@ctypes_function( + "llama_opt_init", + [llama_context_p_ctypes, llama_model_p_ctypes, llama_opt_params], + None, +) +def llama_opt_init(lctx: llama_context_p, model: llama_model_p, lopt_params: llama_opt_params, /): + ... + + +# LLAMA_API void llama_opt_epoch( +# struct llama_context * lctx, +# ggml_opt_dataset_t dataset, +# ggml_opt_result_t result_train, +# ggml_opt_result_t result_eval, +# int64_t idata_split, +# ggml_opt_epoch_callback callback_train, +# ggml_opt_epoch_callback callback_eval); +@ctypes_function( + "llama_opt_epoch", + [ + llama_context_p_ctypes, + ctypes.c_void_p, # ggml_opt_dataset_t + ctypes.c_void_p, # ggml_opt_result_t + ctypes.c_void_p, # ggml_opt_result_t + ctypes.c_int64, + ctypes.c_void_p, # ggml_opt_epoch_callback + ctypes.c_void_p, # ggml_opt_epoch_callback + ], + None, +) +def llama_opt_epoch( + lctx: llama_context_p, + dataset: ctypes.c_void_p, + result_train: ctypes.c_void_p, + result_eval: ctypes.c_void_p, + idata_split: int, + callback_train: ctypes.c_void_p, + callback_eval: ctypes.c_void_p, + /, +): + ... 
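+
+# A heavily simplified sketch of wiring up the training bindings above. `ctx`
+# and `model` are assumed, and a real call must also supply a valid
+# ggml_opt_get_optimizer_params callback, which these bindings leave opaque
+# (c_void_p), so `get_opt_pars=None` below is a placeholder only:
+#
+#   @llama_opt_param_filter
+#   def _train_everything(tensor, userdata):
+#       return True
+#
+#   params = llama_opt_params(
+#       n_ctx_train=0,  # 0: fall back to the context size of `ctx`
+#       param_filter=_train_everything,
+#       param_filter_ud=None,
+#       get_opt_pars=None,  # placeholder, see note above
+#       get_opt_pars_ud=None,
+#   )
+#   llama_opt_init(ctx, model, params)
+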
diff --git a/llama_cpp/mtmd_cpp.py b/llama_cpp/mtmd_cpp.py new file mode 100644 index 000000000..a45f8f406 --- /dev/null +++ b/llama_cpp/mtmd_cpp.py @@ -0,0 +1,280 @@ +from __future__ import annotations + +import os +from ctypes import ( + c_bool, + c_char_p, + c_int, + c_uint8, + c_uint32, + c_float, + c_void_p, + c_size_t, + POINTER, + _Pointer, # type: ignore + Structure, + byref, +) +import pathlib +from typing import ( + Union, + NewType, + Optional, + TYPE_CHECKING, +) + +import llama_cpp.llama_cpp as llama_cpp + +from llama_cpp._ctypes_extensions import ( + load_shared_library, + ctypes_function_for_shared_library, +) + +if TYPE_CHECKING: + from llama_cpp._ctypes_extensions import ( + CtypesArray, + ) + + +# Specify the base name of the shared library to load +_libmtmd_base_name = "mtmd" +_libmtmd_override_path = os.environ.get("MTMD_CPP_LIB") +_libmtmd_base_path = pathlib.Path(os.path.abspath(os.path.dirname(__file__))) / "lib" if _libmtmd_override_path is None else pathlib.Path() + +# Load the library +_libmtmd = load_shared_library(_libmtmd_base_name, _libmtmd_base_path) + +ctypes_function = ctypes_function_for_shared_library(_libmtmd) + +################################################ +# mtmd.h types +################################################ + +# Opaque types +mtmd_context_p = NewType("mtmd_context_p", int) +mtmd_context_p_ctypes = c_void_p + +mtmd_bitmap_p = NewType("mtmd_bitmap_p", int) +mtmd_bitmap_p_ctypes = c_void_p + +mtmd_image_tokens_p = NewType("mtmd_image_tokens_p", int) +mtmd_image_tokens_p_ctypes = c_void_p + +mtmd_input_chunk_p = NewType("mtmd_input_chunk_p", int) +mtmd_input_chunk_p_ctypes = c_void_p + +mtmd_input_chunks_p = NewType("mtmd_input_chunks_p", int) +mtmd_input_chunks_p_ctypes = c_void_p + +# Enums +MTMD_INPUT_CHUNK_TYPE_TEXT = 0 +MTMD_INPUT_CHUNK_TYPE_IMAGE = 1 +MTMD_INPUT_CHUNK_TYPE_AUDIO = 2 + +# Structures +class mtmd_context_params(Structure): + _fields_ = [ + ("use_gpu", c_bool), + ("print_timings", c_bool), + ("n_threads", c_int), + ("verbosity", c_int), # ggml_log_level + ("image_marker", c_char_p), + ("media_marker", c_char_p), + ] + +class mtmd_input_text(Structure): + _fields_ = [ + ("text", c_char_p), + ("add_special", c_bool), + ("parse_special", c_bool), + ] + +################################################ +# mtmd.h functions +################################################ + +# MTMD_API const char * mtmd_default_marker(void); +@ctypes_function("mtmd_default_marker", [], c_char_p) +def mtmd_default_marker() -> bytes: + ... + +# MTMD_API struct mtmd_context_params mtmd_context_params_default(void); +@ctypes_function("mtmd_context_params_default", [], mtmd_context_params) +def mtmd_context_params_default() -> mtmd_context_params: + ... + +# MTMD_API mtmd_context * mtmd_init_from_file(const char * mmproj_fname, +# const struct llama_model * text_model, +# const struct mtmd_context_params ctx_params); +@ctypes_function( + "mtmd_init_from_file", + [c_char_p, llama_cpp.llama_model_p_ctypes, mtmd_context_params], + mtmd_context_p_ctypes +) +def mtmd_init_from_file( + mmproj_fname: bytes, + text_model: llama_cpp.llama_model_p, + ctx_params: mtmd_context_params, + /, +) -> Optional[mtmd_context_p]: + ... + +# MTMD_API void mtmd_free(mtmd_context * ctx); +@ctypes_function("mtmd_free", [mtmd_context_p_ctypes], None) +def mtmd_free(ctx: mtmd_context_p, /): + ... 
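+
+# Sketch: create and destroy a multimodal projector context with the bindings
+# above; the mmproj path is illustrative and `model` is assumed to be an
+# already-loaded llama_model_p:
+#
+#   params = mtmd_context_params_default()
+#   params.use_gpu = True
+#   ctx_mm = mtmd_init_from_file(b"/path/to/mmproj.gguf", model, params)
+#   if ctx_mm is None:
+#       raise RuntimeError("failed to load mmproj")
+#   # ... tokenize and evaluate chunks ...
+#   mtmd_free(ctx_mm)
+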
+ +# MTMD_API bool mtmd_support_vision(mtmd_context * ctx); +@ctypes_function("mtmd_support_vision", [mtmd_context_p_ctypes], c_bool) +def mtmd_support_vision(ctx: mtmd_context_p, /) -> bool: + ... + +# MTMD_API mtmd_bitmap * mtmd_bitmap_init(uint32_t nx, uint32_t ny, const unsigned char * data); +@ctypes_function( + "mtmd_bitmap_init", + [c_uint32, c_uint32, POINTER(c_uint8)], + mtmd_bitmap_p_ctypes +) +def mtmd_bitmap_init( + nx: Union[c_uint32, int], + ny: Union[c_uint32, int], + data: CtypesArray[c_uint8], + /, +) -> Optional[mtmd_bitmap_p]: + ... + +# MTMD_API void mtmd_bitmap_free(mtmd_bitmap * bitmap); +@ctypes_function("mtmd_bitmap_free", [mtmd_bitmap_p_ctypes], None) +def mtmd_bitmap_free(bitmap: mtmd_bitmap_p, /): + ... + +# MTMD_API mtmd_input_chunks * mtmd_input_chunks_init(void); +@ctypes_function("mtmd_input_chunks_init", [], mtmd_input_chunks_p_ctypes) +def mtmd_input_chunks_init() -> Optional[mtmd_input_chunks_p]: + ... + +# MTMD_API void mtmd_input_chunks_free(mtmd_input_chunks * chunks); +@ctypes_function("mtmd_input_chunks_free", [mtmd_input_chunks_p_ctypes], None) +def mtmd_input_chunks_free(chunks: mtmd_input_chunks_p, /): + ... + +# MTMD_API size_t mtmd_input_chunks_size(const mtmd_input_chunks * chunks); +@ctypes_function("mtmd_input_chunks_size", [mtmd_input_chunks_p_ctypes], c_size_t) +def mtmd_input_chunks_size(chunks: mtmd_input_chunks_p, /) -> int: + ... + +# MTMD_API const mtmd_input_chunk * mtmd_input_chunks_get(const mtmd_input_chunks * chunks, size_t idx); +@ctypes_function( + "mtmd_input_chunks_get", + [mtmd_input_chunks_p_ctypes, c_size_t], + mtmd_input_chunk_p_ctypes +) +def mtmd_input_chunks_get( + chunks: mtmd_input_chunks_p, idx: Union[c_size_t, int], / +) -> Optional[mtmd_input_chunk_p]: + ... + +# MTMD_API int32_t mtmd_tokenize(mtmd_context * ctx, +# mtmd_input_chunks * output, +# const mtmd_input_text * text, +# const mtmd_bitmap ** bitmaps, +# size_t n_bitmaps); +@ctypes_function( + "mtmd_tokenize", + [ + mtmd_context_p_ctypes, + mtmd_input_chunks_p_ctypes, + POINTER(mtmd_input_text), + POINTER(mtmd_bitmap_p_ctypes), + c_size_t, + ], + c_int, +) +def mtmd_tokenize( + ctx: mtmd_context_p, + output: mtmd_input_chunks_p, + text: "_Pointer[mtmd_input_text]", + bitmaps: CtypesArray[mtmd_bitmap_p_ctypes], + n_bitmaps: Union[c_size_t, int], + /, +) -> int: + ... + +# MTMD_API size_t mtmd_input_chunk_get_n_tokens(const mtmd_input_chunk * chunk); +@ctypes_function("mtmd_input_chunk_get_n_tokens", [mtmd_input_chunk_p_ctypes], c_size_t) +def mtmd_input_chunk_get_n_tokens(chunk: mtmd_input_chunk_p, /) -> int: + ... + +# MTMD_API enum mtmd_input_chunk_type mtmd_input_chunk_get_type(const mtmd_input_chunk * chunk); +@ctypes_function("mtmd_input_chunk_get_type", [mtmd_input_chunk_p_ctypes], c_int) +def mtmd_input_chunk_get_type(chunk: mtmd_input_chunk_p, /) -> int: + ... + +# MTMD_API const llama_token * mtmd_input_chunk_get_tokens_text(const mtmd_input_chunk * chunk, size_t * n_tokens_output); +@ctypes_function( + "mtmd_input_chunk_get_tokens_text", + [mtmd_input_chunk_p_ctypes, POINTER(c_size_t)], + POINTER(llama_cpp.llama_token) +) +def mtmd_input_chunk_get_tokens_text( + chunk: mtmd_input_chunk_p, n_tokens_output: "_Pointer[c_size_t]", / +) -> Optional["_Pointer[llama_cpp.llama_token]"]: + ... 
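+
+# Sketch: tokenize text plus one image, assuming `ctx_mm` from the previous
+# sketch and raw RGB pixels in `rgb_bytes` with dimensions `width` x `height`:
+#
+#   pixels = (c_uint8 * len(rgb_bytes)).from_buffer_copy(rgb_bytes)
+#   bitmap = mtmd_bitmap_init(width, height, pixels)
+#   chunks = mtmd_input_chunks_init()
+#   text = mtmd_input_text(
+#       text=b"describe this image: " + mtmd_default_marker(),
+#       add_special=True,
+#       parse_special=True,
+#   )
+#   bitmaps = (mtmd_bitmap_p_ctypes * 1)(bitmap)
+#   if mtmd_tokenize(ctx_mm, chunks, byref(text), bitmaps, 1) != 0:
+#       raise RuntimeError("mtmd_tokenize failed")
+#   print(mtmd_input_chunks_size(chunks), "input chunks")
+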
+ +################################################ +# mtmd-helper.h functions +################################################ + +# MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len); +@ctypes_function( + "mtmd_helper_bitmap_init_from_buf", + [mtmd_context_p_ctypes, POINTER(c_uint8), c_size_t], + mtmd_bitmap_p_ctypes +) +def mtmd_helper_bitmap_init_from_buf( + ctx: mtmd_context_p, + buf: CtypesArray[c_uint8], + length: Union[c_size_t, int], + /, +) -> Optional[mtmd_bitmap_p]: + ... + +# MTMD_API size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks); +@ctypes_function("mtmd_helper_get_n_tokens", [mtmd_input_chunks_p_ctypes], c_size_t) +def mtmd_helper_get_n_tokens(chunks: mtmd_input_chunks_p, /) -> int: + ... + +# MTMD_API int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx, +# struct llama_context * lctx, +# const mtmd_input_chunk * chunk, +# llama_pos n_past, +# llama_seq_id seq_id, +# int32_t n_batch, +# bool logits_last, +# llama_pos * new_n_past); +@ctypes_function( + "mtmd_helper_eval_chunk_single", + [ + mtmd_context_p_ctypes, + llama_cpp.llama_context_p_ctypes, + mtmd_input_chunk_p_ctypes, + llama_cpp.llama_pos, + llama_cpp.llama_seq_id, + c_int, + c_bool, + POINTER(llama_cpp.llama_pos), + ], + c_int, +) +def mtmd_helper_eval_chunk_single( + ctx: mtmd_context_p, + lctx: llama_cpp.llama_context_p, + chunk: mtmd_input_chunk_p, + n_past: llama_cpp.llama_pos, + seq_id: llama_cpp.llama_seq_id, + n_batch: Union[c_int, int], + logits_last: Union[c_bool, bool], + new_n_past: "_Pointer[llama_cpp.llama_pos]", + /, +) -> int: + ... diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py index c6716f919..11bd363b5 100644 --- a/llama_cpp/server/model.py +++ b/llama_cpp/server/model.py @@ -171,6 +171,20 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama: chat_handler = llama_cpp.llama_chat_format.MiniCPMv26ChatHandler( clip_model_path=settings.clip_model_path, verbose=settings.verbose ) + elif settings.chat_format == "qwen2.5-vl": + assert settings.clip_model_path is not None, "clip model not found" + if settings.hf_model_repo_id is not None: + chat_handler = ( + llama_cpp.llama_chat_format.Qwen25VLChatHandler.from_pretrained( + repo_id=settings.hf_model_repo_id, + filename=settings.clip_model_path, + verbose=settings.verbose, + ) + ) + else: + chat_handler = llama_cpp.llama_chat_format.Qwen25VLChatHandler( + clip_model_path=settings.clip_model_path, verbose=settings.verbose + ) elif settings.chat_format == "hf-autotokenizer": assert ( settings.hf_pretrained_model_name_or_path is not None diff --git a/tests/test_llama.py b/tests/test_llama.py index fc182ae20..0a1a9f5ad 100644 --- a/tests/test_llama.py +++ b/tests/test_llama.py @@ -216,3 +216,19 @@ def logit_processor_func(input_ids, logits): assert number_1 != number_2 assert number_1 == number_3 + + +def test_real_llama_embeddings(llama_cpp_model_path): + model = llama_cpp.Llama( + llama_cpp_model_path, + n_ctx=32, + n_batch=32, + n_ubatch=32, + n_threads=multiprocessing.cpu_count(), + n_threads_batch=multiprocessing.cpu_count(), + logits_all=False, + flash_attn=True, + embedding=True + ) + # Smoke test for now + model.embed("Hello World") diff --git a/vendor/llama.cpp b/vendor/llama.cpp index f7cd13301..a0374a67e 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit f7cd13301c2a88f97073fd119072b4cc92c08df1 +Subproject commit a0374a67e2924f2e845cdc59dd67d9a44065a89c