@@ -216,30 +216,36 @@ def __init__(
         self,
         model_path: str,
         *,
-        # NOTE: These parameters are likely to change in the future.
-        seed: int = llama_cpp.LLAMA_DEFAULT_SEED,
-        n_ctx: int = 512,
-        n_batch: int = 512,
+        # Model Params
         n_gpu_layers: int = 0,
         main_gpu: int = 0,
         tensor_split: Optional[List[float]] = None,
+        vocab_only: bool = False,
+        use_mmap: bool = True,
+        use_mlock: bool = False,
+        # Context Params
+        seed: int = llama_cpp.LLAMA_DEFAULT_SEED,
+        n_ctx: int = 512,
+        n_batch: int = 512,
+        n_threads: Optional[int] = None,
+        n_threads_batch: Optional[int] = None,
         rope_freq_base: float = 10000.0,
         rope_freq_scale: float = 1.0,
-        low_vram: bool = False,
         mul_mat_q: bool = True,
         f16_kv: bool = True,
         logits_all: bool = False,
-        vocab_only: bool = False,
-        use_mmap: bool = True,
-        use_mlock: bool = False,
         embedding: bool = False,
-        n_threads: Optional[int] = None,
+        # Sampling Params
         last_n_tokens_size: int = 64,
+        # LoRA Params
         lora_base: Optional[str] = None,
+        lora_scale: float = 1.0,
         lora_path: Optional[str] = None,
+        # Backend Params
         numa: bool = False,
-        chat_completion_template: Optional["ChatCompletionFormat"] = None,
+        # Misc
         verbose: bool = True,
+        # Extra Params
         **kwargs,  # type: ignore
     ):
         """Load a llama.cpp model from `model_path`.
@@ -279,79 +285,88 @@ def __init__(
 
         self.verbose = verbose
 
+        self.numa = numa
         if not Llama.__backend_initialized:
             if self.verbose:
-                llama_cpp.llama_backend_init(numa)
+                llama_cpp.llama_backend_init(self.numa)
             else:
                 with suppress_stdout_stderr():
-                    llama_cpp.llama_backend_init(numa)
+                    llama_cpp.llama_backend_init(self.numa)
             Llama.__backend_initialized = True
 
         self.model_path = model_path
 
-        self.params = llama_cpp.llama_context_default_params()
-        self.params.seed = seed
-        self.params.n_ctx = n_ctx
-        self.params.n_gpu_layers = (
+        # Model Params
+        self.model_params = llama_cpp.llama_model_default_params()
+        self.model_params.n_gpu_layers = (
             0x7FFFFFFF if n_gpu_layers == -1 else n_gpu_layers
         )  # 0x7FFFFFFF is INT32 max, will be auto set to all layers
-        self.params.main_gpu = main_gpu
-        self.params.rope_freq_base = rope_freq_base
-        self.params.rope_freq_scale = rope_freq_scale
-        self.params.low_vram = low_vram
-        self.params.mul_mat_q = mul_mat_q
-        self.params.f16_kv = f16_kv
-        self.params.logits_all = logits_all
-        self.params.vocab_only = vocab_only
-        self.params.use_mmap = use_mmap if lora_path is None else False
-        self.params.use_mlock = use_mlock
-        self.params.embedding = embedding
-
+        self.model_params.main_gpu = main_gpu
         self.tensor_split = tensor_split
         self._p_tensor_split = None
-
         if self.tensor_split is not None:
             # Type conversion and expand the list to the length of LLAMA_MAX_DEVICES
             FloatArray = ctypes.c_float * llama_cpp.LLAMA_MAX_DEVICES
             self._c_tensor_split = FloatArray(
-                *tensor_split
+                *tensor_split  # type: ignore
             )  # keep a reference to the array so it is not gc'd
-            self.params.tensor_split = self._c_tensor_split
+            self.model_params.tensor_split = self._c_tensor_split
+        self.model_params.vocab_only = vocab_only
+        self.model_params.use_mmap = use_mmap if lora_path is None else False
+        self.model_params.use_mlock = use_mlock
+
+        self.n_batch = min(n_ctx, n_batch)  # ???
+        self.n_threads = n_threads or max(multiprocessing.cpu_count() // 2, 1)
+        self.n_threads_batch = n_threads_batch or max(
+            multiprocessing.cpu_count() // 2, 1
+        )
 
+        # Context Params
+        self.context_params = llama_cpp.llama_context_default_params()
+        self.context_params.seed = seed
+        self.context_params.n_ctx = n_ctx
+        self.context_params.n_batch = self.n_batch
+        self.context_params.n_threads = self.n_threads
+        self.context_params.n_threads_batch = self.n_threads_batch
+        self.context_params.rope_freq_base = rope_freq_base
+        self.context_params.rope_freq_scale = rope_freq_scale
+        self.context_params.mul_mat_q = mul_mat_q
+        self.context_params.f16_kv = f16_kv
+        self.context_params.logits_all = logits_all
+        self.context_params.embedding = embedding
+
+        # Sampling Params
         self.last_n_tokens_size = last_n_tokens_size
-        self.n_batch = min(n_ctx, n_batch)
 
-        self.chat_completion_template = (
-            chat_completion_template or DefaultChatCompletionFormat()
-        )
 
         self.cache: Optional[BaseLlamaCache] = None
 
-        self.n_threads = n_threads or max(multiprocessing.cpu_count() // 2, 1)
-
         self.lora_base = lora_base
+        self.lora_scale = lora_scale
         self.lora_path = lora_path
 
         if not os.path.exists(model_path):
             raise ValueError(f"Model path does not exist: {model_path}")
 
         if verbose:
             self.model = llama_cpp.llama_load_model_from_file(
-                self.model_path.encode("utf-8"), self.params
+                self.model_path.encode("utf-8"), self.model_params
             )
         else:
             with suppress_stdout_stderr():
                 self.model = llama_cpp.llama_load_model_from_file(
-                    self.model_path.encode("utf-8"), self.params
+                    self.model_path.encode("utf-8"), self.model_params
                 )
         assert self.model is not None
 
         if verbose:
-            self.ctx = llama_cpp.llama_new_context_with_model(self.model, self.params)
+            self.ctx = llama_cpp.llama_new_context_with_model(
+                self.model, self.context_params
+            )
         else:
             with suppress_stdout_stderr():
                 self.ctx = llama_cpp.llama_new_context_with_model(
-                    self.model, self.params
+                    self.model, self.context_params
                 )
 
         assert self.ctx is not None
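
Note: the constructor now mirrors the upstream llama.cpp split between model parameters (consumed by llama_load_model_from_file) and context parameters (consumed by llama_new_context_with_model). A rough sketch of that call order against the low-level bindings, with error handling and the suppress_stdout_stderr wrapper omitted, and a placeholder path:

    import llama_cpp

    model_params = llama_cpp.llama_model_default_params()
    model_params.n_gpu_layers = 0
    model = llama_cpp.llama_load_model_from_file(
        b"./models/example.gguf", model_params  # placeholder path
    )

    context_params = llama_cpp.llama_context_default_params()
    context_params.n_ctx = 512
    ctx = llama_cpp.llama_new_context_with_model(model, context_params)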
@@ -360,6 +375,7 @@ def __init__(
             if llama_cpp.llama_model_apply_lora_from_file(
                 self.model,
                 self.lora_path.encode("utf-8"),
+                self.lora_scale,
                 self.lora_base.encode("utf-8")
                 if self.lora_base is not None
                 else llama_cpp.c_char_p(0),
@@ -416,7 +432,7 @@ def eval_tokens(self) -> Deque[int]:
     def eval_logits(self) -> Deque[List[float]]:
         return deque(
             self.scores[: self.n_tokens, :].tolist(),
-            maxlen=self._n_ctx if self.params.logits_all else 1,
+            maxlen=self._n_ctx if self.model_params.logits_all else 1,
         )
 
     def tokenize(self, text: bytes, add_bos: bool = True) -> List[int]:
@@ -434,7 +450,7 @@ def tokenize(self, text: bytes, add_bos: bool = True) -> List[int]:
         assert self.model is not None
         n_ctx = self._n_ctx
         tokens = (llama_cpp.llama_token * n_ctx)()
-        n_tokens = llama_cpp.llama_tokenize_with_model(
+        n_tokens = llama_cpp.llama_tokenize(
             self.model,
             text,
             len(text),
445
461
if n_tokens < 0 :
446
462
n_tokens = abs (n_tokens )
447
463
tokens = (llama_cpp .llama_token * n_tokens )()
448
- n_tokens = llama_cpp .llama_tokenize_with_model (
464
+ n_tokens = llama_cpp .llama_tokenize (
449
465
self .model ,
450
466
text ,
451
467
len (text ),
@@ -473,7 +489,7 @@ def detokenize(self, tokens: List[int]) -> bytes:
         size = 32
         buffer = (ctypes.c_char * size)()
         for token in tokens:
-            n = llama_cpp.llama_token_to_piece_with_model(
+            n = llama_cpp.llama_token_to_piece(
                 self.model, llama_cpp.llama_token(token), buffer, size
             )
             assert n <= size
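
Note: tokenize and detokenize now call llama_tokenize and llama_token_to_piece, which take the model handle rather than a context. Through the high-level wrapper the round trip looks roughly like this (placeholder path; vocab_only=True loads only the vocabulary, which is enough for tokenization):

    from llama_cpp import Llama

    llm = Llama(model_path="./models/example.gguf", vocab_only=True)  # placeholder path
    ids = llm.tokenize(b"Hello, world!")   # List[int], BOS token prepended by default
    text = llm.detokenize(ids)             # bytes round-tripped from the token ids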
@@ -513,17 +529,16 @@ def eval(self, tokens: Sequence[int]):
                 tokens=(llama_cpp.llama_token * len(batch))(*batch),
                 n_tokens=n_tokens,
                 n_past=n_past,
-                n_threads=self.n_threads,
             )
             if return_code != 0:
                 raise RuntimeError(f"llama_eval returned {return_code}")
             # Save tokens
             self.input_ids[self.n_tokens : self.n_tokens + n_tokens] = batch
             # Save logits
-            rows = n_tokens if self.params.logits_all else 1
+            rows = n_tokens if self.context_params.logits_all else 1
             cols = self._n_vocab
             offset = (
-                0 if self.params.logits_all else n_tokens - 1
+                0 if self.context_params.logits_all else n_tokens - 1
             )  # NOTE: Only save the last token logits if logits_all is False
             self.scores[self.n_tokens + offset : self.n_tokens + n_tokens, :].reshape(
                 -1
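
Note: the per-call n_threads argument is gone from llama_eval (threading now lives in the context params), and the logits bookkeeping is unchanged in behaviour: with logits_all=False only the last token's row is stored per eval call. A small stand-alone restatement of that rows/offset logic (the helper name is illustrative only, not part of the library):

    def logits_rows_and_offset(n_tokens, logits_all):
        # Mirrors the rows/offset computation in Llama.eval above.
        rows = n_tokens if logits_all else 1
        offset = 0 if logits_all else n_tokens - 1
        return rows, offset

    assert logits_rows_and_offset(8, False) == (1, 7)
    assert logits_rows_and_offset(8, True) == (8, 0)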
@@ -807,7 +822,7 @@ def generate(
 
     def create_embedding(
         self, input: Union[str, List[str]], model: Optional[str] = None
-    ) -> Embedding:
+    ) -> CreateEmbeddingResponse:
         """Embed a string.
 
         Args:
@@ -819,7 +834,7 @@ def create_embedding(
         assert self.ctx is not None
         model_name: str = model if model is not None else self.model_path
 
-        if self.params.embedding == False:
+        if self.model_params.embedding == False:
             raise RuntimeError(
                 "Llama model must be created with embedding=True to call this method"
             )
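
Note: create_embedding still requires the model to have been constructed with embedding=True; only where the flag is stored changed. A hedged usage sketch, assuming the OpenAI-style response dict llama-cpp-python returns and a placeholder path:

    from llama_cpp import Llama

    llm = Llama(model_path="./models/example.gguf", embedding=True)  # placeholder path
    resp = llm.create_embedding("hello world")
    vector = resp["data"][0]["embedding"]  # list of floats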
@@ -941,7 +956,7 @@ def _create_completion(
         else:
             stop_sequences = []
 
-        if logprobs is not None and self.params.logits_all is False:
+        if logprobs is not None and self.model_params.logits_all is False:
             raise ValueError(
                 "logprobs is not supported for models created with logits_all=False"
             )
@@ -1632,47 +1647,68 @@ def __del__(self):
 
     def __getstate__(self):
         return dict(
-            verbose=self.verbose,
             model_path=self.model_path,
-            n_ctx=self.params.n_ctx,
-            n_gpu_layers=self.params.n_gpu_layers,
-            seed=self.params.seed,
-            f16_kv=self.params.f16_kv,
-            logits_all=self.params.logits_all,
-            vocab_only=self.params.vocab_only,
-            use_mmap=self.params.use_mmap,
-            use_mlock=self.params.use_mlock,
-            embedding=self.params.embedding,
-            low_vram=self.params.low_vram,
-            last_n_tokens_size=self.last_n_tokens_size,
+            # Model Params
+            n_gpu_layers=self.model_params.n_gpu_layers,
+            main_gpu=self.model_params.main_gpu,
+            tensor_split=self.tensor_split,
+            vocab_only=self.model_params.vocab_only,
+            use_mmap=self.model_params.use_mmap,
+            use_mlock=self.model_params.use_mlock,
+            # Context Params
+            seed=self.context_params.seed,
+            n_ctx=self.context_params.n_ctx,
             n_batch=self.n_batch,
-            n_threads=self.n_threads,
+            n_threads=self.context_params.n_threads,
+            n_threads_batch=self.context_params.n_threads_batch,
+            rope_freq_base=self.context_params.rope_freq_base,
+            rope_freq_scale=self.context_params.rope_freq_scale,
+            mul_mat_q=self.context_params.mul_mat_q,
+            f16_kv=self.context_params.f16_kv,
+            logits_all=self.context_params.logits_all,
+            embedding=self.context_params.embedding,
+            # Sampling Params
+            last_n_tokens_size=self.last_n_tokens_size,
+            # LoRA Params
             lora_base=self.lora_base,
+            lora_scale=self.lora_scale,
             lora_path=self.lora_path,
-            tensor_split=self.tensor_split,
-            mul_mat_q=self.params.mul_mat_q,
+            # Backend Params
+            numa=self.numa,
+            # Misc
+            verbose=self.verbose,
         )
 
     def __setstate__(self, state):
         self.__init__(
             model_path=state["model_path"],
-            n_ctx=state["n_ctx"],
+            # Model Params
             n_gpu_layers=state["n_gpu_layers"],
-            seed=state["seed"],
-            f16_kv=state["f16_kv"],
-            logits_all=state["logits_all"],
+            main_gpu=state["main_gpu"],
+            tensor_split=state["tensor_split"],
             vocab_only=state["vocab_only"],
             use_mmap=state["use_mmap"],
             use_mlock=state["use_mlock"],
-            embedding=state["embedding"],
-            low_vram=state["low_vram"],
-            n_threads=state["n_threads"],
+            # Context Params
+            seed=state["seed"],
+            n_ctx=state["n_ctx"],
             n_batch=state["n_batch"],
+            n_threads=state["n_threads"],
+            n_threads_batch=state["n_threads_batch"],
+            rope_freq_base=state["rope_freq_base"],
+            rope_freq_scale=state["rope_freq_scale"],
+            mul_mat_q=state["mul_mat_q"],
+            f16_kv=state["f16_kv"],
+            logits_all=state["logits_all"],
+            embedding=state["embedding"],
+            # Sampling Params
            last_n_tokens_size=state["last_n_tokens_size"],
+            # LoRA Params
            lora_base=state["lora_base"],
            lora_path=state["lora_path"],
-            tensor_split=state["tensor_split"],
-            mul_mat_q=state["mul_mat_q"],
+            # Backend Params
+            numa=state["numa"],
+            # Misc
             verbose=state["verbose"],
         )
 
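
Note: __getstate__ now emits exactly the regrouped constructor keywords, and __setstate__ feeds them back through __init__, so unpickling reloads the model from model_path. A minimal round-trip sketch under those assumptions (placeholder path):

    import pickle
    from llama_cpp import Llama

    llm = Llama(model_path="./models/example.gguf", n_ctx=2048)  # placeholder path
    clone = pickle.loads(pickle.dumps(llm))   # re-runs __init__ with the saved kwargs
    assert clone.context_params.n_ctx == 2048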
@@ -1725,13 +1761,13 @@ def n_ctx(self) -> int:
 
     def n_embd(self) -> int:
         """Return the embedding size."""
-        assert self.ctx is not None
-        return llama_cpp.llama_n_embd(self.ctx)
+        assert self.model is not None
+        return llama_cpp.llama_n_embd(self.model)
 
     def n_vocab(self) -> int:
         """Return the vocabulary size."""
-        assert self.ctx is not None
-        return llama_cpp.llama_n_vocab(self.ctx)
+        assert self.model is not None
+        return llama_cpp.llama_n_vocab(self.model)
 
     def tokenizer(self) -> "LlamaTokenizer":
         """Return the tokenizer for this model."""