@@ -772,7 +772,7 @@ class llama_context_params(ctypes.Structure):
         cb_eval_user_data (ctypes.ctypes.c_void_p): user data for cb_eval
         type_k (int): data type for K cache
         type_v (int): data type for V cache
-        logits_all (bool): the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
+        logits_all (bool): the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
         embeddings (bool): if true, extract embeddings (together with logits)
         offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU
         flash_attn (bool): whether to use flash attention
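
Since logits_all is deprecated in favor of per-token requests, a caller sets llama_batch.logits[i] for exactly the positions it needs. Below is a minimal sketch of that pattern, assuming the low-level llama_cpp ctypes bindings; the token ids and batch capacity are made up for illustration:

```python
# Illustrative sketch (not part of this commit): request logits per position
# via llama_batch.logits instead of the deprecated logits_all flag.
import llama_cpp

batch = llama_cpp.llama_batch_init(32, 0, 1)  # n_tokens_max, embd, n_seq_max
tokens = [1, 15043, 3186]  # hypothetical token ids
batch.n_tokens = len(tokens)
for i, tok in enumerate(tokens):
    batch.token[i] = tok
    batch.pos[i] = i
    batch.n_seq_id[i] = 1
    batch.seq_id[i][0] = 0
    batch.logits[i] = 0  # do not compute logits for this position
batch.logits[batch.n_tokens - 1] = 1  # request logits for the last token only
# ... pass to llama_cpp.llama_decode(ctx, batch), then read the rows back
# with llama_cpp.llama_get_logits(ctx)
llama_cpp.llama_batch_free(batch)
```
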
@@ -2469,10 +2469,10 @@ def llama_synchronize(ctx: llama_context_p, /):
     "llama_get_logits", [llama_context_p_ctypes], ctypes.POINTER(ctypes.c_float)
 )
 def llama_get_logits(ctx: llama_context_p, /) -> CtypesArray[ctypes.c_float]:
-    """Token logits obtained from the last call to llama_eval()
-    The logits for the last token are stored in the last row
-    Logits for which llama_batch.logits[i] == 0 are undefined
-    Rows: n_tokens provided with llama_batch
+    """Token logits obtained from the last call to llama_decode()
+    The logits for which llama_batch.logits[i] != 0 are stored contiguously
+    in the order they have appeared in the batch.
+    Rows: number of tokens for which llama_batch.logits[i] != 0
     Cols: n_vocab
 
     Returns:
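
As a usage note, here is a hedged sketch of reading logits under the layout the new docstring describes: the returned buffer holds one row of n_vocab floats per position where llama_batch.logits[i] != 0, in batch order. The helper below is hypothetical, and the caller is assumed to track how many rows it requested when building the batch:

```python
# Hypothetical helper (not from this commit) for the contiguous-rows layout.
import llama_cpp

def last_requested_logits(ctx, n_rows: int) -> list[float]:
    # n_rows: number of batch positions with llama_batch.logits[i] != 0,
    # tracked by the caller when it built the batch.
    n_vocab = llama_cpp.llama_n_vocab(llama_cpp.llama_get_model(ctx))
    logits = llama_cpp.llama_get_logits(ctx)  # n_rows * n_vocab floats
    offset = (n_rows - 1) * n_vocab  # start of the last stored row
    return [logits[offset + j] for j in range(n_vocab)]
```

For a single row, the bindings also expose llama_get_logits_ith(ctx, i), which handles the indexing internally (i = -1 selects the last row).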