From 344c106a6961b50f734b95f084fbf02057d4b475 Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Wed, 12 Mar 2025 04:11:42 -0400
Subject: [PATCH 1/3] feat: Update llama.cpp

---
 llama_cpp/llama_cpp.py | 74 ++++++++++++++++++++++++++++++++++++++++--
 vendor/llama.cpp       |  2 +-
 2 files changed, 72 insertions(+), 4 deletions(-)

diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index 205d89a0b..f3985ad2f 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -227,6 +227,7 @@
 # LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26,
 # LLAMA_VOCAB_PRE_TYPE_MINERVA = 27,
 # LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28,
+# LLAMA_VOCAB_PRE_TYPE_GPT4O = 29,
 # };
 LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0
 LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1
@@ -257,6 +258,7 @@
 LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26
 LLAMA_VOCAB_PRE_TYPE_MINERVA = 27
 LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28
+LLAMA_VOCAB_PRE_TYPE_GPT4O = 29
 
 
 # // note: these values should be synchronized with ggml_rope
@@ -1357,6 +1359,12 @@ def llama_model_n_head(model: llama_model_p, /) -> int:
     ...
 
 
+# LLAMA_API int32_t llama_model_n_head_kv (const struct llama_model * model);
+@ctypes_function("llama_model_n_head_kv", [llama_model_p_ctypes], ctypes.c_int32)
+def llama_model_n_head_kv(model: llama_model_p, /) -> int:
+    ...
+
+
 # // Get the model's RoPE frequency scaling factor
 # LLAMA_API float llama_model_rope_freq_scale_train(const struct llama_model * model);
 @ctypes_function("llama_model_rope_freq_scale_train", [llama_model_p_ctypes], ctypes.c_float)
@@ -3375,8 +3383,8 @@ class llama_sampler_i(ctypes.Structure):
 
 
 # struct llama_sampler {
-#     struct llama_sampler_i * iface;
-#     llama_sampler_context_t  ctx;
+#     const struct llama_sampler_i * iface;
+#     llama_sampler_context_t        ctx;
 # };
 class llama_sampler(ctypes.Structure):
     _fields_ = [
@@ -3410,6 +3418,18 @@ class llama_sampler(ctypes.Structure):
 # // mirror of llama_sampler_i:
 
 
+# LLAMA_API struct llama_sampler * llama_sampler_init (const struct llama_sampler_i * iface, llama_sampler_context_t ctx);
+@ctypes_function(
+    "llama_sampler_init",
+    [ctypes.POINTER(llama_sampler_i), llama_sampler_context_t],
+    llama_sampler_p_ctypes,
+)
+def llama_sampler_init(
+    iface: ctypes.POINTER(llama_sampler_i), ctx: llama_sampler_context_t, /
+) -> llama_sampler_p:
+    ...
+
+
 # LLAMA_API const char * llama_sampler_name (const struct llama_sampler * smpl);
 @ctypes_function(
     "llama_sampler_name",
@@ -3627,6 +3647,17 @@ def llama_sampler_init_xtc(
     ...
 
 
+# /// @details Top n sigma sampling as described in academic paper "Top-nσ: Not All Logits Are You Need" https://arxiv.org/pdf/2411.07641
+# LLAMA_API struct llama_sampler * llama_sampler_init_top_n_sigma(float n);
+@ctypes_function(
+    "llama_sampler_init_top_n_sigma",
+    [ctypes.c_float],
+    llama_sampler_p_ctypes,
+)
+def llama_sampler_init_top_n_sigma(n: float, /) -> llama_sampler_p:
+    ...
+
+
 # /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
 # /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
 # /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
@@ -3685,6 +3716,43 @@ def llama_sampler_init_grammar(
     ...
 
 
+# /// @details Lazy grammar sampler, introduced in https://github.com/ggml-org/llama.cpp/pull/9639
+# /// @param trigger_patterns A list of patterns that will trigger the grammar sampler. Pattern will be matched from the start of the generation output, and grammar sampler will be fed content starting from its first match group.
+# /// @param trigger_tokens A list of tokens that will trigger the grammar sampler. Grammar sampler will be fed content starting from the trigger token included.
+# LLAMA_API struct llama_sampler * llama_sampler_init_grammar_lazy_patterns(
+#     const struct llama_vocab * vocab,
+#     const char * grammar_str,
+#     const char * grammar_root,
+#     const char ** trigger_patterns,
+#     size_t num_trigger_patterns,
+#     const llama_token * trigger_tokens,
+#     size_t num_trigger_tokens);
+@ctypes_function(
+    "llama_sampler_init_grammar_lazy_patterns",
+    [
+        llama_vocab_p_ctypes,
+        ctypes.c_char_p,
+        ctypes.c_char_p,
+        ctypes.POINTER(ctypes.c_char_p),
+        ctypes.c_size_t,
+        ctypes.POINTER(llama_token),
+        ctypes.c_size_t,
+    ],
+    llama_sampler_p_ctypes,
+)
+def llama_sampler_init_grammar_lazy_patterns(
+    vocab: llama_vocab_p,
+    grammar_str: bytes,
+    grammar_root: bytes,
+    trigger_patterns: CtypesArray[bytes],
+    num_trigger_patterns: int,
+    trigger_tokens: CtypesArray[llama_token],
+    num_trigger_tokens: int,
+    /,
+) -> llama_sampler_p:
+    ...
+
+
 # /// NOTE: Avoid using on the full vocabulary as searching for repeated tokens can become slow. For example, apply top-k or top-p sampling first.
 # LLAMA_API struct llama_sampler * llama_sampler_init_penalties(
 #         int32_t penalty_last_n, // last n tokens to penalize (0 = disable penalty, -1 = context size)
@@ -3737,7 +3805,7 @@ def llama_sampler_init_dry(
     dry_base: float,
     dry_allowed_length: int,
     dry_penalty_last_n: int,
-    seq_breakers: CtypesArray[bytes],
+    seq_breakers,
     num_breakers: int,
     /,
 ) -> llama_sampler_p:
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 794fe23f2..2c9f833d1 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 794fe23f29fb40104975c91fe19f23798f7c726e
+Subproject commit 2c9f833d17bb5b8ea89dec663b072b5420fc5438

From e232fae1a35c0bdaabf230a5d71780c0178e5d8f Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Wed, 12 Mar 2025 04:44:18 -0400
Subject: [PATCH 2/3] feat: Update llama.cpp

---
 vendor/llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 2c9f833d1..7841fc723 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 2c9f833d17bb5b8ea89dec663b072b5420fc5438
+Subproject commit 7841fc723e059d1fd9640e5c0ef19050fcc7c698

From 37eb5f0a4c2a8706b89ead1406b1577c4602cdec Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Wed, 12 Mar 2025 05:30:21 -0400
Subject: [PATCH 3/3] chore: Bump version

---
 CHANGELOG.md          | 4 ++++
 llama_cpp/__init__.py | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 53365e368..605370e7d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.3.8]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@7841fc723e059d1fd9640e5c0ef19050fcc7c698
+
 ## [0.3.7]
 
 - feat: Update llama.cpp to ggerganov/llama.cpp@794fe23f29fb40104975c91fe19f23798f7c726e
diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py
index fc1fcbcf6..b1a8b9baa 100644
--- a/llama_cpp/__init__.py
+++ b/llama_cpp/__init__.py
@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *
 
-__version__ = "0.3.7"
+__version__ = "0.3.8"
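
Usage note (top-n-sigma): a minimal, untested sketch of driving the
llama_sampler_init_top_n_sigma binding added in PATCH 1/3 through the
low-level sampler-chain API. The chain helpers used here
(llama_sampler_chain_default_params, llama_sampler_chain_init,
llama_sampler_chain_add, llama_sampler_init_dist, llama_sampler_sample,
llama_sampler_free) are pre-existing bindings in llama_cpp/llama_cpp.py;
the threshold n = 1.0 and the seed are placeholder values, not
recommended defaults.

    import llama_cpp.llama_cpp as C

    # Build a sampler chain: top-n-sigma keeps only tokens whose logits lie
    # within n standard deviations of the max, then a dist sampler picks one.
    chain = C.llama_sampler_chain_init(C.llama_sampler_chain_default_params())
    C.llama_sampler_chain_add(chain, C.llama_sampler_init_top_n_sigma(1.0))  # placeholder n
    C.llama_sampler_chain_add(chain, C.llama_sampler_init_dist(1234))        # placeholder seed

    # During decoding, with a llama_context `ctx` created elsewhere:
    #     token = C.llama_sampler_sample(chain, ctx, -1)

    # The chain owns the samplers added to it; freeing it frees them too.
    C.llama_sampler_free(chain)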
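Usage note (lazy grammar): an untested sketch of calling the new
llama_sampler_init_grammar_lazy_patterns binding, mainly to show how the
const char ** and const llama_token * arguments can be marshalled with
ctypes. The grammar text and trigger pattern are placeholders;
llama_model_get_vocab is a pre-existing binding, and `model` is assumed
to have been loaded elsewhere.

    import ctypes
    import llama_cpp.llama_cpp as C

    vocab = C.llama_model_get_vocab(model)  # `model` loaded elsewhere

    grammar_str = b'root ::= "<tool_call>" [^<]* "</tool_call>"'  # placeholder GBNF
    patterns = [b"<tool_call>"]  # patterns that arm the grammar lazily
    tokens: list[int] = []       # no trigger token ids in this sketch

    # ctypes arrays decay to the POINTER(...) parameter types declared above.
    c_patterns = (ctypes.c_char_p * len(patterns))(*patterns)
    c_tokens = (C.llama_token * max(len(tokens), 1))(*tokens)  # keep array non-empty

    smpl = C.llama_sampler_init_grammar_lazy_patterns(
        vocab, grammar_str, b"root",
        c_patterns, len(patterns),
        c_tokens, len(tokens),
    )
    # smpl can then be added to a sampler chain as in the previous note.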