
Commit 9c68382

Merge branch 'main' into configurable-chat-templates
2 parents: 428b64e + a72efc7

File tree: 10 files changed, +808 -364 lines changed

.gitignore (+2)

@@ -1,3 +1,5 @@
+.python-version
+
 .vscode/

 _skbuild/

CHANGELOG.md (+7 -1)

@@ -7,7 +7,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 ## [Unreleased]

-- Update llama.cpp to 8781013ef654270cbead3e0011e33a6d690fb168
+## [0.2.7]
+
+- Update llama.cpp to a98b1633d5a94d0aa84c7c16e1f8df5ac21fc850
+- Install required runtime dlls to package directory on windows by @abetlen in 8d75016549e2ff62a511b1119d966ffc0df5c77b
+- Add openai-processing-ms to server response header by @Tradunsky in #748
+- Bump minimum version of scikit-build-core to 0.5.1 to fix msvc cmake issue by @abetlen in 1ed0f3ebe16993a0f961155aa4b2c85f1c68f668
+- Update `llama_types.py` to better match the openai api, old names are aliased to new ones by @abetlen in dbca136feaaf7f8b1182c4c3c90c32918b1d0bb3

 ## [0.2.6]

llama_cpp/__init__.py (+1 -1)

@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *

-__version__ = "0.2.6"
+__version__ = "0.2.7"

llama_cpp/llama.py (+115 -79)

@@ -216,30 +216,36 @@ def __init__(
         self,
         model_path: str,
         *,
-        # NOTE: These parameters are likely to change in the future.
-        seed: int = llama_cpp.LLAMA_DEFAULT_SEED,
-        n_ctx: int = 512,
-        n_batch: int = 512,
+        # Model Params
         n_gpu_layers: int = 0,
         main_gpu: int = 0,
         tensor_split: Optional[List[float]] = None,
+        vocab_only: bool = False,
+        use_mmap: bool = True,
+        use_mlock: bool = False,
+        # Context Params
+        seed: int = llama_cpp.LLAMA_DEFAULT_SEED,
+        n_ctx: int = 512,
+        n_batch: int = 512,
+        n_threads: Optional[int] = None,
+        n_threads_batch: Optional[int] = None,
         rope_freq_base: float = 10000.0,
         rope_freq_scale: float = 1.0,
-        low_vram: bool = False,
         mul_mat_q: bool = True,
         f16_kv: bool = True,
         logits_all: bool = False,
-        vocab_only: bool = False,
-        use_mmap: bool = True,
-        use_mlock: bool = False,
         embedding: bool = False,
-        n_threads: Optional[int] = None,
+        # Sampling Params
         last_n_tokens_size: int = 64,
+        # LoRA Params
         lora_base: Optional[str] = None,
+        lora_scale: float = 1.0,
         lora_path: Optional[str] = None,
+        # Backend Params
         numa: bool = False,
-        chat_completion_template: Optional["ChatCompletionFormat"] = None,
+        # Misc
         verbose: bool = True,
+        # Extra Params
         **kwargs,  # type: ignore
     ):
         """Load a llama.cpp model from `model_path`.
@@ -279,79 +285,88 @@ def __init__(

         self.verbose = verbose

+        self.numa = numa
         if not Llama.__backend_initialized:
             if self.verbose:
-                llama_cpp.llama_backend_init(numa)
+                llama_cpp.llama_backend_init(self.numa)
             else:
                 with suppress_stdout_stderr():
-                    llama_cpp.llama_backend_init(numa)
+                    llama_cpp.llama_backend_init(self.numa)
             Llama.__backend_initialized = True

         self.model_path = model_path

-        self.params = llama_cpp.llama_context_default_params()
-        self.params.seed = seed
-        self.params.n_ctx = n_ctx
-        self.params.n_gpu_layers = (
+        # Model Params
+        self.model_params = llama_cpp.llama_model_default_params()
+        self.model_params.n_gpu_layers = (
             0x7FFFFFFF if n_gpu_layers == -1 else n_gpu_layers
         )  # 0x7FFFFFFF is INT32 max, will be auto set to all layers
-        self.params.main_gpu = main_gpu
-        self.params.rope_freq_base = rope_freq_base
-        self.params.rope_freq_scale = rope_freq_scale
-        self.params.low_vram = low_vram
-        self.params.mul_mat_q = mul_mat_q
-        self.params.f16_kv = f16_kv
-        self.params.logits_all = logits_all
-        self.params.vocab_only = vocab_only
-        self.params.use_mmap = use_mmap if lora_path is None else False
-        self.params.use_mlock = use_mlock
-        self.params.embedding = embedding
-
+        self.model_params.main_gpu = main_gpu
         self.tensor_split = tensor_split
         self._p_tensor_split = None
-
         if self.tensor_split is not None:
             # Type conversion and expand the list to the length of LLAMA_MAX_DEVICES
             FloatArray = ctypes.c_float * llama_cpp.LLAMA_MAX_DEVICES
             self._c_tensor_split = FloatArray(
-                *tensor_split
+                *tensor_split  # type: ignore
             )  # keep a reference to the array so it is not gc'd
-            self.params.tensor_split = self._c_tensor_split
+            self.model_params.tensor_split = self._c_tensor_split
+        self.model_params.vocab_only = vocab_only
+        self.model_params.use_mmap = use_mmap if lora_path is None else False
+        self.model_params.use_mlock = use_mlock
+
+        self.n_batch = min(n_ctx, n_batch)  # ???
+        self.n_threads = n_threads or max(multiprocessing.cpu_count() // 2, 1)
+        self.n_threads_batch = n_threads_batch or max(
+            multiprocessing.cpu_count() // 2, 1
+        )

+        # Context Params
+        self.context_params = llama_cpp.llama_context_default_params()
+        self.context_params.seed = seed
+        self.context_params.n_ctx = n_ctx
+        self.context_params.n_batch = self.n_batch
+        self.context_params.n_threads = self.n_threads
+        self.context_params.n_threads_batch = self.n_threads_batch
+        self.context_params.rope_freq_base = rope_freq_base
+        self.context_params.rope_freq_scale = rope_freq_scale
+        self.context_params.mul_mat_q = mul_mat_q
+        self.context_params.f16_kv = f16_kv
+        self.context_params.logits_all = logits_all
+        self.context_params.embedding = embedding
+
+        # Sampling Params
         self.last_n_tokens_size = last_n_tokens_size
-        self.n_batch = min(n_ctx, n_batch)

-        self.chat_completion_template = (
-            chat_completion_template or DefaultChatCompletionFormat()
-        )

         self.cache: Optional[BaseLlamaCache] = None

-        self.n_threads = n_threads or max(multiprocessing.cpu_count() // 2, 1)
-
         self.lora_base = lora_base
+        self.lora_scale = lora_scale
         self.lora_path = lora_path

         if not os.path.exists(model_path):
             raise ValueError(f"Model path does not exist: {model_path}")

         if verbose:
             self.model = llama_cpp.llama_load_model_from_file(
-                self.model_path.encode("utf-8"), self.params
+                self.model_path.encode("utf-8"), self.model_params
             )
         else:
             with suppress_stdout_stderr():
                 self.model = llama_cpp.llama_load_model_from_file(
-                    self.model_path.encode("utf-8"), self.params
+                    self.model_path.encode("utf-8"), self.model_params
                 )
         assert self.model is not None

         if verbose:
-            self.ctx = llama_cpp.llama_new_context_with_model(self.model, self.params)
+            self.ctx = llama_cpp.llama_new_context_with_model(
+                self.model, self.context_params
+            )
         else:
             with suppress_stdout_stderr():
                 self.ctx = llama_cpp.llama_new_context_with_model(
-                    self.model, self.params
+                    self.model, self.context_params
                 )

         assert self.ctx is not None

@@ -360,6 +375,7 @@ def __init__(
             if llama_cpp.llama_model_apply_lora_from_file(
                 self.model,
                 self.lora_path.encode("utf-8"),
+                self.lora_scale,
                 self.lora_base.encode("utf-8")
                 if self.lora_base is not None
                 else llama_cpp.c_char_p(0),
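
The hunks above replace the single `llama_context_params` struct with separate model and context parameter structs, loaded in two steps. A rough sketch of that two-step pattern against the low-level bindings used in the diff (placeholder model path, error handling omitted):

```python
import multiprocessing

import llama_cpp

llama_cpp.llama_backend_init(False)  # numa=False, matching the wrapper's default

# Step 1: model params govern how the weights themselves are loaded.
model_params = llama_cpp.llama_model_default_params()
model_params.n_gpu_layers = 0
model = llama_cpp.llama_load_model_from_file(
    b"./models/llama-2-7b.Q4_K_M.gguf",  # hypothetical path
    model_params,
)

# Step 2: context params govern the runtime state built on top of the model.
ctx_params = llama_cpp.llama_context_default_params()
ctx_params.n_ctx = 2048
ctx_params.n_threads = max(multiprocessing.cpu_count() // 2, 1)
ctx = llama_cpp.llama_new_context_with_model(model, ctx_params)
```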
@@ -416,7 +432,7 @@ def eval_tokens(self) -> Deque[int]:
     def eval_logits(self) -> Deque[List[float]]:
         return deque(
             self.scores[: self.n_tokens, :].tolist(),
-            maxlen=self._n_ctx if self.params.logits_all else 1,
+            maxlen=self._n_ctx if self.model_params.logits_all else 1,
         )

     def tokenize(self, text: bytes, add_bos: bool = True) -> List[int]:
@@ -434,7 +450,7 @@ def tokenize(self, text: bytes, add_bos: bool = True) -> List[int]:
         assert self.model is not None
         n_ctx = self._n_ctx
         tokens = (llama_cpp.llama_token * n_ctx)()
-        n_tokens = llama_cpp.llama_tokenize_with_model(
+        n_tokens = llama_cpp.llama_tokenize(
             self.model,
             text,
             len(text),

@@ -445,7 +461,7 @@ def tokenize(self, text: bytes, add_bos: bool = True) -> List[int]:
         if n_tokens < 0:
             n_tokens = abs(n_tokens)
             tokens = (llama_cpp.llama_token * n_tokens)()
-            n_tokens = llama_cpp.llama_tokenize_with_model(
+            n_tokens = llama_cpp.llama_tokenize(
                 self.model,
                 text,
                 len(text),

@@ -473,7 +489,7 @@ def detokenize(self, tokens: List[int]) -> bytes:
         size = 32
         buffer = (ctypes.c_char * size)()
         for token in tokens:
-            n = llama_cpp.llama_token_to_piece_with_model(
+            n = llama_cpp.llama_token_to_piece(
                 self.model, llama_cpp.llama_token(token), buffer, size
             )
             assert n <= size
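
`tokenize` and `detokenize` now call the model-level C functions (`llama_tokenize`, `llama_token_to_piece`) instead of the removed `*_with_model` variants, but their Python signatures are unchanged. A round-trip sketch, assuming a `Llama` instance loaded from a placeholder path:

```python
from llama_cpp import Llama

llm = Llama(model_path="./models/llama-2-7b.Q4_K_M.gguf", verbose=False)  # hypothetical path

tokens = llm.tokenize(b"Hello, world!")  # bytes in, token ids out (BOS prepended by default)
print(tokens)
print(llm.detokenize(tokens))            # token ids back to (approximately) the original bytes
```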
@@ -513,17 +529,16 @@ def eval(self, tokens: Sequence[int]):
                 tokens=(llama_cpp.llama_token * len(batch))(*batch),
                 n_tokens=n_tokens,
                 n_past=n_past,
-                n_threads=self.n_threads,
             )
             if return_code != 0:
                 raise RuntimeError(f"llama_eval returned {return_code}")
             # Save tokens
             self.input_ids[self.n_tokens : self.n_tokens + n_tokens] = batch
             # Save logits
-            rows = n_tokens if self.params.logits_all else 1
+            rows = n_tokens if self.context_params.logits_all else 1
             cols = self._n_vocab
             offset = (
-                0 if self.params.logits_all else n_tokens - 1
+                0 if self.context_params.logits_all else n_tokens - 1
             )  # NOTE: Only save the last token logits if logits_all is False
             self.scores[self.n_tokens + offset : self.n_tokens + n_tokens, :].reshape(
                 -1
@@ -807,7 +822,7 @@ def generate(

     def create_embedding(
         self, input: Union[str, List[str]], model: Optional[str] = None
-    ) -> Embedding:
+    ) -> CreateEmbeddingResponse:
         """Embed a string.

         Args:

@@ -819,7 +834,7 @@ def create_embedding(
         assert self.ctx is not None
         model_name: str = model if model is not None else self.model_path

-        if self.params.embedding == False:
+        if self.model_params.embedding == False:
             raise RuntimeError(
                 "Llama model must be created with embedding=True to call this method"
             )
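
`create_embedding` now advertises the OpenAI-style `CreateEmbeddingResponse` from `llama_types.py` and, as before, requires the model to be constructed with `embedding=True`. A usage sketch under those assumptions (placeholder path):

```python
from llama_cpp import Llama

llm = Llama(
    model_path="./models/llama-2-7b.Q4_K_M.gguf",  # hypothetical path
    embedding=True,  # required; otherwise create_embedding raises RuntimeError
    verbose=False,
)

response = llm.create_embedding("A short sentence to embed.")
vector = response["data"][0]["embedding"]  # OpenAI-style response layout
print(len(vector))
```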
@@ -941,7 +956,7 @@ def _create_completion(
         else:
             stop_sequences = []

-        if logprobs is not None and self.params.logits_all is False:
+        if logprobs is not None and self.model_params.logits_all is False:
             raise ValueError(
                 "logprobs is not supported for models created with logits_all=False"
             )
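
The check above means per-token logprobs are only available when every position's logits are kept, i.e. the model was built with `logits_all=True`. A sketch, assuming the OpenAI-style completion response shape (placeholder path):

```python
from llama_cpp import Llama

llm = Llama(
    model_path="./models/llama-2-7b.Q4_K_M.gguf",  # hypothetical path
    logits_all=True,  # keep logits for every position, not just the last token
    verbose=False,
)

out = llm.create_completion("The capital of France is", max_tokens=8, logprobs=5)
print(out["choices"][0]["logprobs"]["top_logprobs"])
```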
@@ -1632,47 +1647,68 @@ def __del__(self):

     def __getstate__(self):
         return dict(
-            verbose=self.verbose,
             model_path=self.model_path,
-            n_ctx=self.params.n_ctx,
-            n_gpu_layers=self.params.n_gpu_layers,
-            seed=self.params.seed,
-            f16_kv=self.params.f16_kv,
-            logits_all=self.params.logits_all,
-            vocab_only=self.params.vocab_only,
-            use_mmap=self.params.use_mmap,
-            use_mlock=self.params.use_mlock,
-            embedding=self.params.embedding,
-            low_vram=self.params.low_vram,
-            last_n_tokens_size=self.last_n_tokens_size,
+            # Model Params
+            n_gpu_layers=self.model_params.n_gpu_layers,
+            main_gpu=self.model_params.main_gpu,
+            tensor_split=self.tensor_split,
+            vocab_only=self.model_params.vocab_only,
+            use_mmap=self.model_params.use_mmap,
+            use_mlock=self.model_params.use_mlock,
+            # Context Params
+            seed=self.context_params.seed,
+            n_ctx=self.context_params.n_ctx,
             n_batch=self.n_batch,
-            n_threads=self.n_threads,
+            n_threads=self.context_params.n_threads,
+            n_threads_batch=self.context_params.n_threads_batch,
+            rope_freq_base=self.context_params.rope_freq_base,
+            rope_freq_scale=self.context_params.rope_freq_scale,
+            mul_mat_q=self.context_params.mul_mat_q,
+            f16_kv=self.context_params.f16_kv,
+            logits_all=self.context_params.logits_all,
+            embedding=self.context_params.embedding,
+            # Sampling Params
+            last_n_tokens_size=self.last_n_tokens_size,
+            # LoRA Params
             lora_base=self.lora_base,
+            lora_scale=self.lora_scale,
             lora_path=self.lora_path,
-            tensor_split=self.tensor_split,
-            mul_mat_q=self.params.mul_mat_q,
+            # Backend Params
+            numa=self.numa,
+            # Misc
+            verbose=self.verbose,
         )

     def __setstate__(self, state):
         self.__init__(
             model_path=state["model_path"],
-            n_ctx=state["n_ctx"],
+            # Model Params
             n_gpu_layers=state["n_gpu_layers"],
-            seed=state["seed"],
-            f16_kv=state["f16_kv"],
-            logits_all=state["logits_all"],
+            main_gpu=state["main_gpu"],
+            tensor_split=state["tensor_split"],
             vocab_only=state["vocab_only"],
             use_mmap=state["use_mmap"],
             use_mlock=state["use_mlock"],
-            embedding=state["embedding"],
-            low_vram=state["low_vram"],
-            n_threads=state["n_threads"],
+            # Context Params
+            seed=state["seed"],
+            n_ctx=state["n_ctx"],
             n_batch=state["n_batch"],
+            n_threads=state["n_threads"],
+            n_threads_batch=state["n_threads_batch"],
+            rope_freq_base=state["rope_freq_base"],
+            rope_freq_scale=state["rope_freq_scale"],
+            mul_mat_q=state["mul_mat_q"],
+            f16_kv=state["f16_kv"],
+            logits_all=state["logits_all"],
+            embedding=state["embedding"],
+            # Sampling Params
             last_n_tokens_size=state["last_n_tokens_size"],
+            # LoRA Params
             lora_base=state["lora_base"],
             lora_path=state["lora_path"],
-            tensor_split=state["tensor_split"],
-            mul_mat_q=state["mul_mat_q"],
+            # Backend Params
+            numa=state["numa"],
+            # Misc
             verbose=state["verbose"],
         )
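
`__getstate__` captures only the constructor arguments (grouped the same way as the new signature), so unpickling re-runs `__init__` and reloads the weights from `model_path`. A round-trip sketch; the model file must still exist wherever the object is unpickled (placeholder path):

```python
import pickle

from llama_cpp import Llama

llm = Llama(model_path="./models/llama-2-7b.Q4_K_M.gguf", n_ctx=2048, verbose=False)  # hypothetical path

blob = pickle.dumps(llm)       # serializes the kwargs, not the loaded weights
restored = pickle.loads(blob)  # calls __init__ again and reloads the model file
print(restored.n_ctx())
```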

@@ -1725,13 +1761,13 @@ def n_ctx(self) -> int:

     def n_embd(self) -> int:
         """Return the embedding size."""
-        assert self.ctx is not None
-        return llama_cpp.llama_n_embd(self.ctx)
+        assert self.model is not None
+        return llama_cpp.llama_n_embd(self.model)

     def n_vocab(self) -> int:
         """Return the vocabulary size."""
-        assert self.ctx is not None
-        return llama_cpp.llama_n_vocab(self.ctx)
+        assert self.model is not None
+        return llama_cpp.llama_n_vocab(self.model)

     def tokenizer(self) -> "LlamaTokenizer":
         """Return the tokenizer for this model."""
