Commit 358593f

Merge branch 'main' into batch-processing
2 parents: 7f4ba48 + 4b11fa8

File tree: 12 files changed, +228 −44 lines


CHANGELOG.md

Lines changed: 9 additions & 0 deletions

@@ -7,6 +7,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 ## [Unreleased]

+## [0.2.29]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@4483396751c79dea540808b9cb9238245d06da2b
+- feat: Add split_mode option by @abetlen in 84615adbc6855c8384807c42f0130f9a1763f99d
+- feat: Implement GGUF metadata KV overrides by @phiharri in #1011
+- fix: Avoid "LookupError: unknown encoding: ascii" when open() called in a destructor by @yieldthought in #1012
+- fix: Fix low_level_api_chat_cpp example to match current API by @aniljava in #1086
+- fix: Fix Pydantic model parsing by @DeNeutoy in #1087
+
 ## [0.2.28]

 - feat: Update llama.cpp to ggerganov/llama.cpp@6efb8eb30e7025b168f3fda3ff83b9b386428ad6
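Two of the new features above surface as Llama constructor arguments (see the llama_cpp/llama.py diff later in this commit). A hedged usage sketch; the model path and the override key are illustrative, not values the commit prescribes:

    import llama_cpp
    from llama_cpp import Llama

    llm = Llama(
        model_path="./models/model.gguf",           # illustrative path
        split_mode=llama_cpp.LLAMA_SPLIT_LAYER,     # how to split the model across GPUs
        kv_overrides={"some.metadata.key": 8},      # override a GGUF metadata value (key is hypothetical)
    )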

examples/low_level_api/common.py

Lines changed: 1 addition & 1 deletion

@@ -106,7 +106,7 @@ def gpt_params_parse(argv = None):
 parser.add_argument("--mirostat_lr", type=float, default=0.1, help="Mirostat learning rate, parameter eta",dest="mirostat_eta")

 parser.add_argument("-m", "--model", type=str, default="./models/llama-7B/ggml-model.bin", help="model path",dest="model")
-parser.add_argument("-p", "--prompt", type=str, default="", help="initial prompt",dest="prompt")
+parser.add_argument("-p", "--prompt", type=str, default=None, help="initial prompt",dest="prompt")
 parser.add_argument("-f", "--file", type=str, default=None, help="file containing initial prompt to load",dest="file")
 parser.add_argument("--session", type=str, default=None, help="file to cache model state in (may be large!)",dest="path_session")
 parser.add_argument("--in-prefix", type=str, default="", help="string to prefix user inputs with", dest="input_prefix")

examples/low_level_api/low_level_api_chat_cpp.py

Lines changed: 36 additions & 14 deletions

@@ -62,7 +62,7 @@ def __init__(self, params: GptParams) -> None:
 self.multibyte_fix = []

 # model load
-self.lparams = llama_cpp.llama_context_default_params()
+self.lparams = llama_cpp.llama_model_default_params()
 self.lparams.n_ctx = self.params.n_ctx
 self.lparams.n_parts = self.params.n_parts
 self.lparams.seed = self.params.seed
@@ -72,7 +72,11 @@ def __init__(self, params: GptParams) -> None:

 self.model = llama_cpp.llama_load_model_from_file(
     self.params.model.encode("utf8"), self.lparams)
-self.ctx = llama_cpp.llama_new_context_with_model(self.model, self.lparams)
+
+# Context Params.
+self.cparams = llama_cpp.llama_context_default_params()
+
+self.ctx = llama_cpp.llama_new_context_with_model(self.model, self.cparams)
 if (not self.ctx):
     raise RuntimeError(f"error: failed to load model '{self.params.model}'")
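The example now mirrors upstream llama.cpp, which separates model parameters (used when loading weights) from context parameters (used when creating the inference state). A minimal sketch of that load sequence using the same low-level calls as the hunk above; the model path is illustrative:

    import llama_cpp

    model_params = llama_cpp.llama_model_default_params()    # loading / GPU-offload options
    ctx_params = llama_cpp.llama_context_default_params()    # per-session inference options
    ctx_params.n_ctx = 2048

    model = llama_cpp.llama_load_model_from_file(b"./models/model.gguf", model_params)
    ctx = llama_cpp.llama_new_context_with_model(model, ctx_params)
    if not ctx:
        raise RuntimeError("failed to create a llama.cpp context")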

@@ -244,7 +248,7 @@ def __init__(self, params: GptParams) -> None:
 # tokenize a prompt
 def _tokenize(self, prompt, bos=True):
     _arr = (llama_cpp.llama_token * ((len(prompt) + 1) * 4))()
-    _n = llama_cpp.llama_tokenize(self.ctx, prompt.encode("utf8", errors="ignore"), _arr, len(_arr), bos)
+    _n = llama_cpp.llama_tokenize(self.model, prompt.encode("utf8", errors="ignore"), len(prompt), _arr, len(_arr), bos, False)
     return _arr[:_n]

 def set_color(self, c):
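llama_tokenize now takes the model handle instead of the context, plus an explicit text length and a trailing flag for special-token parsing. A small helper in the same spirit, assuming a model handle loaded as in the previous sketch; the negative-return check reflects assumed llama.cpp behaviour (buffer too small):

    import llama_cpp

    def tokenize(model, text: str, add_bos: bool = True) -> list:
        data = text.encode("utf-8", errors="ignore")
        buf = (llama_cpp.llama_token * ((len(data) + 1) * 4))()  # generous upper bound
        n = llama_cpp.llama_tokenize(model, data, len(data), buf, len(buf), add_bos, False)
        if n < 0:
            raise RuntimeError("token buffer too small")
        return list(buf[:n])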
@@ -304,7 +308,7 @@ def generate(self):
 self.n_past += n_eval"""

 if (llama_cpp.llama_eval(
-    self.ctx, (llama_cpp.llama_token * len(self.embd))(*self.embd), len(self.embd), self.n_past, self.params.n_threads
+    self.ctx, (llama_cpp.llama_token * len(self.embd))(*self.embd), len(self.embd), self.n_past
 ) != 0):
     raise Exception("Failed to llama_eval!")

@@ -332,7 +336,7 @@ def generate(self):
 id = 0

 logits = llama_cpp.llama_get_logits(self.ctx)
-n_vocab = llama_cpp.llama_n_vocab(self.ctx)
+n_vocab = llama_cpp.llama_n_vocab(self.model)

 # Apply params.logit_bias map
 for key, value in self.params.logit_bias.items():
@@ -349,12 +353,20 @@ def generate(self):
 last_n_repeat = min(len(self.last_n_tokens), repeat_last_n, self.n_ctx)

 _arr = (llama_cpp.llama_token * last_n_repeat)(*self.last_n_tokens[len(self.last_n_tokens) - last_n_repeat:])
-llama_cpp.llama_sample_repetition_penalty(self.ctx, candidates_p,
-    _arr,
-    last_n_repeat, llama_cpp.c_float(self.params.repeat_penalty))
-llama_cpp.llama_sample_frequency_and_presence_penalties(self.ctx, candidates_p,
-    _arr,
-    last_n_repeat, llama_cpp.c_float(self.params.frequency_penalty), llama_cpp.c_float(self.params.presence_penalty))
+llama_cpp.llama_sample_repetition_penalties(
+    ctx=self.ctx,
+    candidates=candidates_p,
+    last_tokens_data = _arr,
+    penalty_last_n = last_n_repeat,
+    penalty_repeat = llama_cpp.c_float(self.params.repeat_penalty),
+    penalty_freq = llama_cpp.c_float(self.params.frequency_penalty),
+    penalty_present = llama_cpp.c_float(self.params.presence_penalty),
+)
+
+# NOT PRESENT IN CURRENT VERSION ?
+# llama_cpp.llama_sample_frequency_and_presence_penalti(self.ctx, candidates_p,
+#     _arr,
+#     last_n_repeat, llama_cpp.c_float(self.params.frequency_penalty), llama_cpp.c_float(self.params.presence_penalty))

 if not self.params.penalize_nl:
     logits[llama_cpp.llama_token_nl()] = nl_logit
@@ -473,7 +485,7 @@ def exit(self):
 def token_to_str(self, token_id: int) -> bytes:
     size = 32
     buffer = (ctypes.c_char * size)()
-    n = llama_cpp.llama_token_to_piece_with_model(
+    n = llama_cpp.llama_token_to_piece(
         self.model, llama_cpp.llama_token(token_id), buffer, size)
     assert n <= size
     return bytes(buffer[:n])
@@ -532,6 +544,9 @@ def interact(self):
     print(i,end="",flush=True)
 self.params.input_echo = False

+# Using string instead of tokens to check for antiprompt,
+# It is more reliable than tokens for interactive mode.
+generated_str = ""
 while self.params.interactive:
     self.set_color(util.CONSOLE_COLOR_USER_INPUT)
     if (self.params.instruct):
@@ -546,6 +561,10 @@ def interact(self):
 try:
     for i in self.output():
         print(i,end="",flush=True)
+        generated_str += i
+        for ap in self.params.antiprompt:
+            if generated_str.endswith(ap):
+                raise KeyboardInterrupt
 except KeyboardInterrupt:
     self.set_color(util.CONSOLE_COLOR_DEFAULT)
     if not self.params.instruct:
@@ -561,7 +580,7 @@ def interact(self):
 time_now = datetime.now()
 prompt = f"""Text transcript of a never ending dialog, where {USER_NAME} interacts with an AI assistant named {AI_NAME}.
 {AI_NAME} is helpful, kind, honest, friendly, good at writing and never fails to answer {USER_NAME}’s requests immediately and with details and precision.
-There are no annotations like (30 seconds passed...) or (to himself), just what {USER_NAME} and {AI_NAME} say aloud to each other.
+Transcript below contains only the recorded dialog between two, without any annotations like (30 seconds passed...) or (to himself), just what {USER_NAME} and {AI_NAME} say aloud to each other.
 The dialog lasts for years, the entirety of it is shared below. It's 10000 pages long.
 The transcript only includes text, it does not include markup like HTML and Markdown.

@@ -575,8 +594,11 @@ def interact(self):
 {AI_NAME}: A cat is a domestic species of small carnivorous mammal. It is the only domesticated species in the family Felidae.
 {USER_NAME}: Name a color.
 {AI_NAME}: Blue
-{USER_NAME}:"""
+{USER_NAME}: """
+
 params = gpt_params_parse()
+if params.prompt is None and params.file is None:
+    params.prompt = prompt

 with LLaMAInteract(params) as m:
     m.interact()
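Two behavioural changes stand out in this example: output is accumulated as a string and generation stops as soon as it ends with any antiprompt, and the long built-in chat prompt is only used when neither -p nor -f was passed. The antiprompt check needs nothing from llama_cpp, so here is a standalone sketch of the idea:

    antiprompts = ["USER:"]          # illustrative antiprompt strings
    generated_str = ""

    def hit_antiprompt(piece: str) -> bool:
        """Accumulate a generated piece and report whether an antiprompt was just produced."""
        global generated_str
        generated_str += piece
        return any(generated_str.endswith(ap) for ap in antiprompts)

    for piece in ["Hello", " there.", "\n", "USER:"]:
        print(piece, end="", flush=True)
        if hit_antiprompt(piece):
            break  # the example raises KeyboardInterrupt here to hand control back to the user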

llama_cpp/__init__.py

Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *

-__version__ = "0.2.28"
+__version__ = "0.2.29"

llama_cpp/_utils.py

Lines changed: 9 additions & 11 deletions

@@ -1,11 +1,15 @@
 import os
 import sys

+import sys, traceback
+
+# Avoid "LookupError: unknown encoding: ascii" when open() called in a destructor
+outnull_file = open(os.devnull, "w")
+errnull_file = open(os.devnull, "w")

 class suppress_stdout_stderr(object):
     # NOTE: these must be "saved" here to avoid exceptions when using
     # this context manager inside of a __del__ method
-    open = open
     sys = sys
     os = os

@@ -21,9 +25,6 @@ def __enter__(self):
 if not hasattr(self.sys.stdout, 'fileno') or not hasattr(self.sys.stderr, 'fileno'):
     return self  # Return the instance without making changes

-self.outnull_file = self.open(self.os.devnull, "w")
-self.errnull_file = self.open(self.os.devnull, "w")
-
 self.old_stdout_fileno_undup = self.sys.stdout.fileno()
 self.old_stderr_fileno_undup = self.sys.stderr.fileno()

@@ -33,11 +34,11 @@ def __enter__(self):
     self.old_stdout = self.sys.stdout
     self.old_stderr = self.sys.stderr

-    self.os.dup2(self.outnull_file.fileno(), self.old_stdout_fileno_undup)
-    self.os.dup2(self.errnull_file.fileno(), self.old_stderr_fileno_undup)
+    self.os.dup2(outnull_file.fileno(), self.old_stdout_fileno_undup)
+    self.os.dup2(errnull_file.fileno(), self.old_stderr_fileno_undup)

-    self.sys.stdout = self.outnull_file
-    self.sys.stderr = self.errnull_file
+    self.sys.stdout = outnull_file
+    self.sys.stderr = errnull_file
     return self

 def __exit__(self, *_):
@@ -54,6 +55,3 @@ def __exit__(self, *_):

 self.os.close(self.old_stdout_fileno)
 self.os.close(self.old_stderr_fileno)
-
-self.outnull_file.close()
-self.errnull_file.close()
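The fix opens the devnull handles once at import time, so the context manager no longer has to call open() while the interpreter is shutting down, which is the situation that raised "LookupError: unknown encoding: ascii" from a __del__ method. A simplified, self-contained sketch of the pattern (it swaps the sys streams only and skips the file-descriptor dup2 dance; the names are illustrative):

    import os
    import sys

    # Opened at import time, so nothing calls open() during interpreter teardown.
    _outnull = open(os.devnull, "w")
    _errnull = open(os.devnull, "w")

    class suppress_output:
        # Saved here so the name still resolves when used from a __del__ method.
        sys = sys

        def __enter__(self):
            self._old_stdout, self._old_stderr = self.sys.stdout, self.sys.stderr
            self.sys.stdout, self.sys.stderr = _outnull, _errnull
            return self

        def __exit__(self, *_):
            self.sys.stdout, self.sys.stderr = self._old_stdout, self._old_stderr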

llama_cpp/llama.py

Lines changed: 38 additions & 1 deletion

@@ -52,11 +52,13 @@ def __init__(
 *,
 # Model Params
 n_gpu_layers: int = 0,
+split_mode: int = llama_cpp.LLAMA_SPLIT_LAYER,
 main_gpu: int = 0,
 tensor_split: Optional[List[float]] = None,
 vocab_only: bool = False,
 use_mmap: bool = True,
 use_mlock: bool = False,
+kv_overrides: Optional[Dict[str, Union[bool, int, float]]] = None,
 # Context Params
 seed: int = llama_cpp.LLAMA_DEFAULT_SEED,
 n_ctx: int = 512,
@@ -120,11 +122,13 @@ def __init__(
 Args:
     model_path: Path to the model.
     n_gpu_layers: Number of layers to offload to GPU (-ngl). If -1, all layers are offloaded.
-    main_gpu: The GPU that is used for scratch and small tensors.
+    split_mode: How to split the model across GPUs. See llama_cpp.LLAMA_SPLIT_* for options.
+    main_gpu: main_gpu interpretation depends on split_mode: LLAMA_SPLIT_NONE: the GPU that is used for the entire model. LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results. LLAMA_SPLIT_LAYER: ignored
     tensor_split: How split tensors should be distributed across GPUs. If None, the model is not split.
     vocab_only: Only load the vocabulary no weights.
     use_mmap: Use mmap if possible.
     use_mlock: Force the system to keep the model in RAM.
+    kv_overrides: Key-value overrides for the model.
     seed: RNG seed, -1 for random
     n_ctx: Text context, 0 = from model
     n_batch: Prompt processing maximum batch size
@@ -170,6 +174,7 @@ def __init__(
 self.model_params.n_gpu_layers = (
     0x7FFFFFFF if n_gpu_layers == -1 else n_gpu_layers
 )  # 0x7FFFFFFF is INT32 max, will be auto set to all layers
+self.model_params.split_mode = split_mode
 self.model_params.main_gpu = main_gpu
 self.tensor_split = tensor_split
 self._c_tensor_split = None
@@ -188,6 +193,34 @@ def __init__(
 self.model_params.use_mmap = use_mmap if lora_path is None else False
 self.model_params.use_mlock = use_mlock

+self.kv_overrides = kv_overrides
+if kv_overrides is not None:
+    n_overrides = len(kv_overrides)
+    self._kv_overrides_array = llama_cpp.llama_model_kv_override * (n_overrides + 1)
+    self._kv_overrides_array_keys = []
+
+    for k, v in kv_overrides.items():
+        key_buf = ctypes.create_string_buffer(k.encode("utf-8"))
+        self._kv_overrides_array_keys.append(key_buf)
+        self._kv_overrides_array[i].key = key_buf
+        if isinstance(v, int):
+            self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_INT
+            self._kv_overrides_array[i].value.int_value = v
+        elif isinstance(v, float):
+            self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_FLOAT
+            self._kv_overrides_array[i].value.float_value = v
+        elif isinstance(v, bool):
+            self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_BOOL
+            self._kv_overrides_array[i].value.bool_value = v
+        else:
+            raise ValueError(f"Unknown value type for {k}: {v}")
+
+    self._kv_overrides_array_sentinel_key = b'\0'
+
+    # null array sentinel
+    self._kv_overrides_array[n_overrides].key = self._kv_overrides_array_sentinel_key
+    self.model_params.kv_overrides = self._kv_overrides_array
+
 self.n_batch = min(n_ctx, n_batch)  # ???
 self.n_threads = n_threads or max(multiprocessing.cpu_count() // 2, 1)
 self.n_threads_batch = n_threads_batch or max(
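As shown, the loop in the hunk above indexes the array with an undefined i and assigns the ctypes array type rather than an instance, so it looks incomplete. A self-contained sketch of the intended pattern, a NULL-terminated ctypes array whose key buffers are kept alive, using a hypothetical Override struct rather than the real llama_model_kv_override binding:

    import ctypes

    class Override(ctypes.Structure):
        # Hypothetical stand-in for llama_cpp.llama_model_kv_override.
        _fields_ = [("key", ctypes.c_char_p), ("tag", ctypes.c_int), ("int_value", ctypes.c_int64)]

    def build_overrides(kv):
        arr = (Override * (len(kv) + 1))()       # instantiate; the extra zero-initialised slot acts as the NULL-key sentinel
        keys = []                                # ctypes stores only pointers, so keep the key bytes alive
        for i, (k, v) in enumerate(kv.items()):  # enumerate provides the index the hunk's loop is missing
            key = k.encode("utf-8")
            keys.append(key)
            arr[i].key = key
            arr[i].tag = 0                       # integer override in this sketch
            arr[i].int_value = int(v)
        return arr, keys                         # both must stay referenced while the array is in use

    overrides, _keys = build_overrides({"example.key": 8})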
@@ -1399,11 +1432,13 @@ def __getstate__(self):
 model_path=self.model_path,
 # Model Params
 n_gpu_layers=self.model_params.n_gpu_layers,
+split_mode=self.model_params.split_mode,
 main_gpu=self.model_params.main_gpu,
 tensor_split=self.tensor_split,
 vocab_only=self.model_params.vocab_only,
 use_mmap=self.model_params.use_mmap,
 use_mlock=self.model_params.use_mlock,
+kv_overrides=self.kv_overrides,
 # Context Params
 seed=self.context_params.seed,
 n_ctx=self.context_params.n_ctx,
@@ -1441,11 +1476,13 @@ def __setstate__(self, state):
 model_path=state["model_path"],
 # Model Params
 n_gpu_layers=state["n_gpu_layers"],
+split_mode=state["split_mode"],
 main_gpu=state["main_gpu"],
 tensor_split=state["tensor_split"],
 vocab_only=state["vocab_only"],
 use_mmap=state["use_mmap"],
 use_mlock=state["use_mlock"],
+kv_overrides=state["kv_overrides"],
 # Context Params
 seed=state["seed"],
 n_ctx=state["n_ctx"],
