
Commit 9c68382

Merge branch 'main' into configurable-chat-templates
2 parents: 428b64e + a72efc7

File tree: 10 files changed, +808 -364 lines changed

.gitignore (+2)

@@ -1,3 +1,5 @@
+.python-version
+
 .vscode/

 _skbuild/

CHANGELOG.md (+7 -1)

@@ -7,7 +7,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 ## [Unreleased]

-- Update llama.cpp to 8781013ef654270cbead3e0011e33a6d690fb168
+## [0.2.7]
+
+- Update llama.cpp to a98b1633d5a94d0aa84c7c16e1f8df5ac21fc850
+- Install required runtime dlls to package directory on windows by @abetlen in 8d75016549e2ff62a511b1119d966ffc0df5c77b
+- Add openai-processing-ms to server response header by @Tradunsky in #748
+- Bump minimum version of scikit-build-core to 0.5.1 to fix msvc cmake issue by @abetlen in 1ed0f3ebe16993a0f961155aa4b2c85f1c68f668
+- Update `llama_types.py` to better match the openai api, old names are aliased to new ones by @abetlen in dbca136feaaf7f8b1182c4c3c90c32918b1d0bb3

 ## [0.2.6]

llama_cpp/__init__.py (+1 -1)

@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *

-__version__ = "0.2.6"
+__version__ = "0.2.7"

llama_cpp/llama.py (+115 -79)

@@ -216,30 +216,36 @@ def __init__(
         self,
         model_path: str,
         *,
-        # NOTE: These parameters are likely to change in the future.
-        seed: int = llama_cpp.LLAMA_DEFAULT_SEED,
-        n_ctx: int = 512,
-        n_batch: int = 512,
+        # Model Params
         n_gpu_layers: int = 0,
         main_gpu: int = 0,
         tensor_split: Optional[List[float]] = None,
+        vocab_only: bool = False,
+        use_mmap: bool = True,
+        use_mlock: bool = False,
+        # Context Params
+        seed: int = llama_cpp.LLAMA_DEFAULT_SEED,
+        n_ctx: int = 512,
+        n_batch: int = 512,
+        n_threads: Optional[int] = None,
+        n_threads_batch: Optional[int] = None,
         rope_freq_base: float = 10000.0,
         rope_freq_scale: float = 1.0,
-        low_vram: bool = False,
         mul_mat_q: bool = True,
         f16_kv: bool = True,
         logits_all: bool = False,
-        vocab_only: bool = False,
-        use_mmap: bool = True,
-        use_mlock: bool = False,
         embedding: bool = False,
-        n_threads: Optional[int] = None,
+        # Sampling Params
         last_n_tokens_size: int = 64,
+        # LoRA Params
         lora_base: Optional[str] = None,
+        lora_scale: float = 1.0,
         lora_path: Optional[str] = None,
+        # Backend Params
         numa: bool = False,
-        chat_completion_template: Optional["ChatCompletionFormat"] = None,
+        # Misc
         verbose: bool = True,
+        # Extra Params
         **kwargs,  # type: ignore
     ):
         """Load a llama.cpp model from `model_path`.
@@ -279,79 +285,88 @@ def __init__(

         self.verbose = verbose

+        self.numa = numa
         if not Llama.__backend_initialized:
             if self.verbose:
-                llama_cpp.llama_backend_init(numa)
+                llama_cpp.llama_backend_init(self.numa)
             else:
                 with suppress_stdout_stderr():
-                    llama_cpp.llama_backend_init(numa)
+                    llama_cpp.llama_backend_init(self.numa)
             Llama.__backend_initialized = True

         self.model_path = model_path

-        self.params = llama_cpp.llama_context_default_params()
-        self.params.seed = seed
-        self.params.n_ctx = n_ctx
-        self.params.n_gpu_layers = (
+        # Model Params
+        self.model_params = llama_cpp.llama_model_default_params()
+        self.model_params.n_gpu_layers = (
             0x7FFFFFFF if n_gpu_layers == -1 else n_gpu_layers
         )  # 0x7FFFFFFF is INT32 max, will be auto set to all layers
-        self.params.main_gpu = main_gpu
-        self.params.rope_freq_base = rope_freq_base
-        self.params.rope_freq_scale = rope_freq_scale
-        self.params.low_vram = low_vram
-        self.params.mul_mat_q = mul_mat_q
-        self.params.f16_kv = f16_kv
-        self.params.logits_all = logits_all
-        self.params.vocab_only = vocab_only
-        self.params.use_mmap = use_mmap if lora_path is None else False
-        self.params.use_mlock = use_mlock
-        self.params.embedding = embedding
-
+        self.model_params.main_gpu = main_gpu
         self.tensor_split = tensor_split
         self._p_tensor_split = None
-
         if self.tensor_split is not None:
             # Type conversion and expand the list to the length of LLAMA_MAX_DEVICES
             FloatArray = ctypes.c_float * llama_cpp.LLAMA_MAX_DEVICES
             self._c_tensor_split = FloatArray(
-                *tensor_split
+                *tensor_split  # type: ignore
             )  # keep a reference to the array so it is not gc'd
-            self.params.tensor_split = self._c_tensor_split
+            self.model_params.tensor_split = self._c_tensor_split
+        self.model_params.vocab_only = vocab_only
+        self.model_params.use_mmap = use_mmap if lora_path is None else False
+        self.model_params.use_mlock = use_mlock
+
+        self.n_batch = min(n_ctx, n_batch)  # ???
+        self.n_threads = n_threads or max(multiprocessing.cpu_count() // 2, 1)
+        self.n_threads_batch = n_threads_batch or max(
+            multiprocessing.cpu_count() // 2, 1
+        )

+        # Context Params
+        self.context_params = llama_cpp.llama_context_default_params()
+        self.context_params.seed = seed
+        self.context_params.n_ctx = n_ctx
+        self.context_params.n_batch = self.n_batch
+        self.context_params.n_threads = self.n_threads
+        self.context_params.n_threads_batch = self.n_threads_batch
+        self.context_params.rope_freq_base = rope_freq_base
+        self.context_params.rope_freq_scale = rope_freq_scale
+        self.context_params.mul_mat_q = mul_mat_q
+        self.context_params.f16_kv = f16_kv
+        self.context_params.logits_all = logits_all
+        self.context_params.embedding = embedding
+
+        # Sampling Params
         self.last_n_tokens_size = last_n_tokens_size
-        self.n_batch = min(n_ctx, n_batch)

-        self.chat_completion_template = (
-            chat_completion_template or DefaultChatCompletionFormat()
-        )

         self.cache: Optional[BaseLlamaCache] = None

-        self.n_threads = n_threads or max(multiprocessing.cpu_count() // 2, 1)
-
         self.lora_base = lora_base
+        self.lora_scale = lora_scale
         self.lora_path = lora_path

         if not os.path.exists(model_path):
             raise ValueError(f"Model path does not exist: {model_path}")

         if verbose:
             self.model = llama_cpp.llama_load_model_from_file(
-                self.model_path.encode("utf-8"), self.params
+                self.model_path.encode("utf-8"), self.model_params
             )
         else:
             with suppress_stdout_stderr():
                 self.model = llama_cpp.llama_load_model_from_file(
-                    self.model_path.encode("utf-8"), self.params
+                    self.model_path.encode("utf-8"), self.model_params
                 )
         assert self.model is not None

         if verbose:
-            self.ctx = llama_cpp.llama_new_context_with_model(self.model, self.params)
+            self.ctx = llama_cpp.llama_new_context_with_model(
+                self.model, self.context_params
+            )
         else:
             with suppress_stdout_stderr():
                 self.ctx = llama_cpp.llama_new_context_with_model(
-                    self.model, self.params
+                    self.model, self.context_params
                 )

         assert self.ctx is not None

@@ -360,6 +375,7 @@ def __init__(
             if llama_cpp.llama_model_apply_lora_from_file(
                 self.model,
                 self.lora_path.encode("utf-8"),
+                self.lora_scale,
                 self.lora_base.encode("utf-8")
                 if self.lora_base is not None
                 else llama_cpp.c_char_p(0),
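
The hunks above replace the single `llama_context_params` struct with separate model and context parameter structs, loaded in two steps. A rough sketch of that two-step pattern against the low-level bindings used in the diff (placeholder model path, error handling omitted):

```python
import multiprocessing

import llama_cpp

llama_cpp.llama_backend_init(False)  # numa=False, matching the wrapper's default

# Step 1: model params govern how the weights themselves are loaded.
model_params = llama_cpp.llama_model_default_params()
model_params.n_gpu_layers = 0
model = llama_cpp.llama_load_model_from_file(
    b"./models/llama-2-7b.Q4_K_M.gguf",  # hypothetical path
    model_params,
)

# Step 2: context params govern the runtime state built on top of the model.
ctx_params = llama_cpp.llama_context_default_params()
ctx_params.n_ctx = 2048
ctx_params.n_threads = max(multiprocessing.cpu_count() // 2, 1)
ctx = llama_cpp.llama_new_context_with_model(model, ctx_params)
```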
@@ -416,7 +432,7 @@ def eval_tokens(self) -> Deque[int]:
     def eval_logits(self) -> Deque[List[float]]:
         return deque(
             self.scores[: self.n_tokens, :].tolist(),
-            maxlen=self._n_ctx if self.params.logits_all else 1,
+            maxlen=self._n_ctx if self.model_params.logits_all else 1,
         )

     def tokenize(self, text: bytes, add_bos: bool = True) -> List[int]:
@@ -434,7 +450,7 @@ def tokenize(self, text: bytes, add_bos: bool = True) -> List[int]:
         assert self.model is not None
         n_ctx = self._n_ctx
         tokens = (llama_cpp.llama_token * n_ctx)()
-        n_tokens = llama_cpp.llama_tokenize_with_model(
+        n_tokens = llama_cpp.llama_tokenize(
             self.model,
             text,
             len(text),

@@ -445,7 +461,7 @@ def tokenize(self, text: bytes, add_bos: bool = True) -> List[int]:
         if n_tokens < 0:
             n_tokens = abs(n_tokens)
             tokens = (llama_cpp.llama_token * n_tokens)()
-            n_tokens = llama_cpp.llama_tokenize_with_model(
+            n_tokens = llama_cpp.llama_tokenize(
                 self.model,
                 text,
                 len(text),

@@ -473,7 +489,7 @@ def detokenize(self, tokens: List[int]) -> bytes:
         size = 32
         buffer = (ctypes.c_char * size)()
         for token in tokens:
-            n = llama_cpp.llama_token_to_piece_with_model(
+            n = llama_cpp.llama_token_to_piece(
                 self.model, llama_cpp.llama_token(token), buffer, size
             )
             assert n <= size
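
`tokenize` and `detokenize` now call the model-level C functions (`llama_tokenize`, `llama_token_to_piece`) instead of the removed `*_with_model` variants, but their Python signatures are unchanged. A round-trip sketch, assuming a `Llama` instance loaded from a placeholder path:

```python
from llama_cpp import Llama

llm = Llama(model_path="./models/llama-2-7b.Q4_K_M.gguf", verbose=False)  # hypothetical path

tokens = llm.tokenize(b"Hello, world!")  # bytes in, token ids out (BOS prepended by default)
print(tokens)
print(llm.detokenize(tokens))            # token ids back to (approximately) the original bytes
```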
@@ -513,17 +529,16 @@ def eval(self, tokens: Sequence[int]):
                 tokens=(llama_cpp.llama_token * len(batch))(*batch),
                 n_tokens=n_tokens,
                 n_past=n_past,
-                n_threads=self.n_threads,
             )
             if return_code != 0:
                 raise RuntimeError(f"llama_eval returned {return_code}")
             # Save tokens
             self.input_ids[self.n_tokens : self.n_tokens + n_tokens] = batch
             # Save logits
-            rows = n_tokens if self.params.logits_all else 1
+            rows = n_tokens if self.context_params.logits_all else 1
             cols = self._n_vocab
             offset = (
-                0 if self.params.logits_all else n_tokens - 1
+                0 if self.context_params.logits_all else n_tokens - 1
             )  # NOTE: Only save the last token logits if logits_all is False
             self.scores[self.n_tokens + offset : self.n_tokens + n_tokens, :].reshape(
                 -1
@@ -807,7 +822,7 @@ def generate(

     def create_embedding(
         self, input: Union[str, List[str]], model: Optional[str] = None
-    ) -> Embedding:
+    ) -> CreateEmbeddingResponse:
         """Embed a string.

         Args:

@@ -819,7 +834,7 @@ def create_embedding(
         assert self.ctx is not None
         model_name: str = model if model is not None else self.model_path

-        if self.params.embedding == False:
+        if self.model_params.embedding == False:
             raise RuntimeError(
                 "Llama model must be created with embedding=True to call this method"
             )
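
`create_embedding` now advertises the OpenAI-style `CreateEmbeddingResponse` from `llama_types.py` and, as before, requires the model to be constructed with `embedding=True`. A usage sketch under those assumptions (placeholder path):

```python
from llama_cpp import Llama

llm = Llama(
    model_path="./models/llama-2-7b.Q4_K_M.gguf",  # hypothetical path
    embedding=True,  # required; otherwise create_embedding raises RuntimeError
    verbose=False,
)

response = llm.create_embedding("A short sentence to embed.")
vector = response["data"][0]["embedding"]  # OpenAI-style response layout
print(len(vector))
```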
@@ -941,7 +956,7 @@ def _create_completion(
         else:
             stop_sequences = []

-        if logprobs is not None and self.params.logits_all is False:
+        if logprobs is not None and self.model_params.logits_all is False:
             raise ValueError(
                 "logprobs is not supported for models created with logits_all=False"
             )
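
The check above means per-token logprobs are only available when every position's logits are kept, i.e. the model was built with `logits_all=True`. A sketch, assuming the OpenAI-style completion response shape (placeholder path):

```python
from llama_cpp import Llama

llm = Llama(
    model_path="./models/llama-2-7b.Q4_K_M.gguf",  # hypothetical path
    logits_all=True,  # keep logits for every position, not just the last token
    verbose=False,
)

out = llm.create_completion("The capital of France is", max_tokens=8, logprobs=5)
print(out["choices"][0]["logprobs"]["top_logprobs"])
```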
@@ -1632,47 +1647,68 @@ def __del__(self):

     def __getstate__(self):
         return dict(
-            verbose=self.verbose,
             model_path=self.model_path,
-            n_ctx=self.params.n_ctx,
-            n_gpu_layers=self.params.n_gpu_layers,
-            seed=self.params.seed,
-            f16_kv=self.params.f16_kv,
-            logits_all=self.params.logits_all,
-            vocab_only=self.params.vocab_only,
-            use_mmap=self.params.use_mmap,
-            use_mlock=self.params.use_mlock,
-            embedding=self.params.embedding,
-            low_vram=self.params.low_vram,
-            last_n_tokens_size=self.last_n_tokens_size,
+            # Model Params
+            n_gpu_layers=self.model_params.n_gpu_layers,
+            main_gpu=self.model_params.main_gpu,
+            tensor_split=self.tensor_split,
+            vocab_only=self.model_params.vocab_only,
+            use_mmap=self.model_params.use_mmap,
+            use_mlock=self.model_params.use_mlock,
+            # Context Params
+            seed=self.context_params.seed,
+            n_ctx=self.context_params.n_ctx,
             n_batch=self.n_batch,
-            n_threads=self.n_threads,
+            n_threads=self.context_params.n_threads,
+            n_threads_batch=self.context_params.n_threads_batch,
+            rope_freq_base=self.context_params.rope_freq_base,
+            rope_freq_scale=self.context_params.rope_freq_scale,
+            mul_mat_q=self.context_params.mul_mat_q,
+            f16_kv=self.context_params.f16_kv,
+            logits_all=self.context_params.logits_all,
+            embedding=self.context_params.embedding,
+            # Sampling Params
+            last_n_tokens_size=self.last_n_tokens_size,
+            # LoRA Params
             lora_base=self.lora_base,
+            lora_scale=self.lora_scale,
             lora_path=self.lora_path,
-            tensor_split=self.tensor_split,
-            mul_mat_q=self.params.mul_mat_q,
+            # Backend Params
+            numa=self.numa,
+            # Misc
+            verbose=self.verbose,
         )

     def __setstate__(self, state):
         self.__init__(
             model_path=state["model_path"],
-            n_ctx=state["n_ctx"],
+            # Model Params
             n_gpu_layers=state["n_gpu_layers"],
-            seed=state["seed"],
-            f16_kv=state["f16_kv"],
-            logits_all=state["logits_all"],
+            main_gpu=state["main_gpu"],
+            tensor_split=state["tensor_split"],
             vocab_only=state["vocab_only"],
             use_mmap=state["use_mmap"],
             use_mlock=state["use_mlock"],
-            embedding=state["embedding"],
-            low_vram=state["low_vram"],
-            n_threads=state["n_threads"],
+            # Context Params
+            seed=state["seed"],
+            n_ctx=state["n_ctx"],
             n_batch=state["n_batch"],
+            n_threads=state["n_threads"],
+            n_threads_batch=state["n_threads_batch"],
+            rope_freq_base=state["rope_freq_base"],
+            rope_freq_scale=state["rope_freq_scale"],
+            mul_mat_q=state["mul_mat_q"],
+            f16_kv=state["f16_kv"],
+            logits_all=state["logits_all"],
+            embedding=state["embedding"],
+            # Sampling Params
             last_n_tokens_size=state["last_n_tokens_size"],
+            # LoRA Params
             lora_base=state["lora_base"],
             lora_path=state["lora_path"],
-            tensor_split=state["tensor_split"],
-            mul_mat_q=state["mul_mat_q"],
+            # Backend Params
+            numa=state["numa"],
+            # Misc
             verbose=state["verbose"],
         )
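
`__getstate__` captures only the constructor arguments (grouped the same way as the new signature), so unpickling re-runs `__init__` and reloads the weights from `model_path`. A round-trip sketch; the model file must still exist wherever the object is unpickled (placeholder path):

```python
import pickle

from llama_cpp import Llama

llm = Llama(model_path="./models/llama-2-7b.Q4_K_M.gguf", n_ctx=2048, verbose=False)  # hypothetical path

blob = pickle.dumps(llm)       # serializes the kwargs, not the loaded weights
restored = pickle.loads(blob)  # calls __init__ again and reloads the model file
print(restored.n_ctx())
```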

@@ -1725,13 +1761,13 @@ def n_ctx(self) -> int:

     def n_embd(self) -> int:
         """Return the embedding size."""
-        assert self.ctx is not None
-        return llama_cpp.llama_n_embd(self.ctx)
+        assert self.model is not None
+        return llama_cpp.llama_n_embd(self.model)

     def n_vocab(self) -> int:
         """Return the vocabulary size."""
-        assert self.ctx is not None
-        return llama_cpp.llama_n_vocab(self.ctx)
+        assert self.model is not None
+        return llama_cpp.llama_n_vocab(self.model)

     def tokenizer(self) -> "LlamaTokenizer":
         """Return the tokenizer for this model."""
