
Commit 4442ff8

shakalaca and abetlen authored
fix: error showing time spent in llama perf context print (abetlen#1898)
* feat: Sync with llama.cpp

  Add `no_perf` field to `llama_context_params` to optionally disable performance timing measurements.

* fix: Display performance metrics by default

---------

Co-authored-by: Andrei <[email protected]>
1 parent 14879c7 commit 4442ff8
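
From the Python side, the change surfaces as a new keyword argument on the high-level `Llama` constructor. A minimal usage sketch, assuming a local GGUF model file (the path below is a placeholder):

```python
from llama_cpp import Llama

# no_perf defaults to False, so performance timings are measured again
# and llama.cpp's perf summary (the time spent per context) is printed
# as part of the verbose output.
llm = Llama(model_path="./model.gguf", verbose=True)

# Opting out: no_perf=True disables the timing measurements entirely.
llm_quiet = Llama(model_path="./model.gguf", no_perf=True)
```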

File tree: 2 files changed, +7 -0 lines changed

llama_cpp/llama.py (+4)
```diff
@@ -94,6 +94,7 @@ def __init__(
         offload_kqv: bool = True,
         flash_attn: bool = False,
         # Sampling Params
+        no_perf: bool = False,
         last_n_tokens_size: int = 64,
         # LoRA Params
         lora_base: Optional[str] = None,
@@ -173,6 +174,7 @@ def __init__(
             embedding: Embedding mode only.
             offload_kqv: Offload K, Q, V to GPU.
             flash_attn: Use flash attention.
+            no_perf: Whether to disable performance timing measurements.
             last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
             lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model.
             lora_path: Path to a LoRA file to apply to the model.
@@ -351,6 +353,7 @@ def __init__(
         if type_v is not None:
             self.context_params.type_v = type_v
         # Sampling Params
+        self.context_params.no_perf = no_perf
         self.last_n_tokens_size = last_n_tokens_size
 
         self.cache: Optional[BaseLlamaCache] = None
@@ -2093,6 +2096,7 @@ def __getstate__(self):
             offload_kqv=self.context_params.offload_kqv,
             flash_attn=self.context_params.flash_attn,
             # Sampling Params
+            no_perf=self.context_params.no_perf,
             last_n_tokens_size=self.last_n_tokens_size,
             # LoRA Params
             lora_base=self.lora_base,
```
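
The `__getstate__` hunk keeps serialization consistent: without it, a pickled `Llama` instance would drop the `no_perf` setting when restored. A sketch of the round trip, assuming pickling a full model is practical in your environment and using the same placeholder model path:

```python
import pickle

from llama_cpp import Llama

llm = Llama(model_path="./model.gguf", no_perf=True)

# __getstate__ now includes no_perf, so the flag survives a round trip.
restored = pickle.loads(pickle.dumps(llm))
assert restored.context_params.no_perf
```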

llama_cpp/llama_cpp.py (+3)
```diff
@@ -780,6 +780,7 @@ class llama_context_params(ctypes.Structure):
         embeddings (bool): if true, extract embeddings (together with logits)
         offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU
         flash_attn (bool): whether to use flash attention
+        no_perf (bool): whether to disable performance timing measurements
         abort_callback (ggml_abort_callback): abort callback; if it returns true, execution of llama_decode() will be aborted
         abort_callback_data (ctypes.c_void_p): data for abort_callback
     """
@@ -810,6 +811,7 @@ class llama_context_params(ctypes.Structure):
         embeddings: bool
         offload_kqv: bool
         flash_attn: bool
+        no_perf: bool
         abort_callback: Callable[[ctypes.c_void_p], bool]
         abort_callback_data: ctypes.c_void_p
 
@@ -839,6 +841,7 @@ class llama_context_params(ctypes.Structure):
         ("embeddings", ctypes.c_bool),
         ("offload_kqv", ctypes.c_bool),
         ("flash_attn", ctypes.c_bool),
+        ("no_perf", ctypes.c_bool),
         ("abort_callback", ggml_abort_callback),
         ("abort_callback_data", ctypes.c_void_p),
     ]
```
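
Because `llama_context_params` is a `ctypes.Structure` mirroring the C struct in llama.cpp's `llama.h`, the insertion point matters: `_fields_` must match the C declaration order exactly, which is why `no_perf` sits between `flash_attn` and `abort_callback` just as upstream declares it. A reduced, self-contained sketch of the pattern, using hypothetical fields for illustration only:

```python
import ctypes

# Toy mirror of a C struct: field order and types must match the C
# declaration, otherwise every later field is read at the wrong offset.
class toy_context_params(ctypes.Structure):
    _fields_ = [
        ("flash_attn", ctypes.c_bool),
        ("no_perf", ctypes.c_bool),  # inserted in C-struct order
        ("abort_callback_data", ctypes.c_void_p),
    ]

params = toy_context_params(flash_attn=False, no_perf=True)
print(params.no_perf)  # True
```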
