Commit 51dce74

misc: Fix support for new parameters, deprecate rpc_servers parameter
1 parent 0d475d7 commit 51dce74

File tree: 1 file changed, +19 -15 lines changed


llama_cpp/llama.py

Lines changed: 19 additions & 15 deletions
@@ -66,7 +66,6 @@ def __init__(
         split_mode: int = llama_cpp.LLAMA_SPLIT_MODE_LAYER,
         main_gpu: int = 0,
         tensor_split: Optional[List[float]] = None,
-        rpc_servers: Optional[str] = None,
         vocab_only: bool = False,
         use_mmap: bool = True,
         use_mlock: bool = False,
@@ -93,6 +92,8 @@ def __init__(
         embedding: bool = False,
         offload_kqv: bool = True,
         flash_attn: bool = False,
+        op_offloat: bool | None = None,
+        swa_full: bool | None = None,
         # Sampling Params
         no_perf: bool = False,
         last_n_tokens_size: int = 64,
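
As a usage note for callers: this hunk adds two optional keyword arguments to the Llama constructor, while the first hunk drops rpc_servers. A minimal sketch of the new surface, assuming a locally available GGUF file (the model path and prompt are placeholders, and the parameter names are spelled exactly as in this commit, including op_offloat):

from llama_cpp import Llama

# Both new flags default to None, which leaves the llama.cpp context defaults untouched.
llm = Llama(
    model_path="./models/model.gguf",  # placeholder path
    n_gpu_layers=-1,
    op_offloat=True,   # offload host tensor operations to the device (per the docstring below)
    swa_full=False,    # do not force the full-size SWA cache
)

# rpc_servers is no longer a recognized constructor parameter after this commit.
out = llm("Q: What is 2 + 2? A:", max_tokens=8)
print(out["choices"][0]["text"])
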
@@ -150,7 +151,6 @@ def __init__(
             split_mode: How to split the model across GPUs. See llama_cpp.LLAMA_SPLIT_* for options.
             main_gpu: main_gpu interpretation depends on split_mode: LLAMA_SPLIT_MODE_NONE: the GPU that is used for the entire model. LLAMA_SPLIT_MODE_ROW: the GPU that is used for small tensors and intermediate results. LLAMA_SPLIT_MODE_LAYER: ignored
             tensor_split: How split tensors should be distributed across GPUs. If None, the model is not split.
-            rpc_servers: Comma separated list of RPC servers to use for offloading
             vocab_only: Only load the vocabulary no weights.
             use_mmap: Use mmap if possible.
             use_mlock: Force the system to keep the model in RAM.
@@ -174,6 +174,8 @@ def __init__(
             embedding: Embedding mode only.
             offload_kqv: Offload K, Q, V to GPU.
             flash_attn: Use flash attention.
+            op_offloat: offload host tensor operations to device
+            swa_full: use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
             no_perf: Measure performance timings.
             last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
             lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model.
@@ -226,11 +228,6 @@ def __init__(
         ) # 0x7FFFFFFF is INT32 max, will be auto set to all layers
         self.model_params.split_mode = split_mode
         self.model_params.main_gpu = main_gpu
-        if rpc_servers is not None:
-            self.model_params.rpc_servers = rpc_servers.encode("utf-8")
-            self._rpc_servers = rpc_servers
-        else:
-            self._rpc_servers = None
         self.tensor_split = tensor_split
         self._c_tensor_split = None
         if self.tensor_split is not None:
@@ -341,12 +338,17 @@ def __init__(
             yarn_beta_slow if yarn_beta_slow != 0.0 else 0
         )
         self.context_params.yarn_orig_ctx = yarn_orig_ctx if yarn_orig_ctx != 0 else 0
-        self.context_params.logits_all = (
-            logits_all if draft_model is None else True
-        ) # Must be set to True for speculative decoding
+        self._logits_all = logits_all if draft_model is None else True
         self.context_params.embeddings = embedding # TODO: Rename to embeddings
         self.context_params.offload_kqv = offload_kqv
         self.context_params.flash_attn = flash_attn
+
+        if op_offloat is not None:
+            self.context_params.op_offloat = op_offloat
+
+        if swa_full is not None:
+            self.context_params.swa_full = swa_full
+
         # KV cache quantization
         if type_k is not None:
             self.context_params.type_k = type_k
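
Two things are worth spelling out about this hunk: logits_all is now tracked Python-side as self._logits_all rather than written into the llama.cpp context params, and the new flags are copied into context_params only when the caller actually supplied them, so leaving them as None keeps whatever defaults llama.cpp ships. A small standalone sketch of that "override only if set" pattern, using a made-up stand-in object instead of the real ctypes struct:

from dataclasses import dataclass
from typing import Optional

@dataclass
class FakeContextParams:
    # Hypothetical stand-in for llama_cpp.llama_context_params with invented defaults.
    op_offloat: bool = True
    swa_full: bool = True

def apply_flags(
    params: FakeContextParams,
    op_offloat: Optional[bool] = None,
    swa_full: Optional[bool] = None,
) -> FakeContextParams:
    # Mirror the commit: only overwrite a field when a value was explicitly passed.
    if op_offloat is not None:
        params.op_offloat = op_offloat
    if swa_full is not None:
        params.swa_full = swa_full
    return params

print(apply_flags(FakeContextParams()))                  # defaults untouched
print(apply_flags(FakeContextParams(), swa_full=False))  # only swa_full overridden
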
@@ -568,7 +570,7 @@ def eval_tokens(self) -> Deque[int]:
     def eval_logits(self) -> Deque[List[float]]:
         return deque(
             self.scores[: self.n_tokens, :].tolist(),
-            maxlen=self._n_ctx if self.context_params.logits_all else 1,
+            maxlen=self._n_ctx if self._logits_all else 1,
         )

     def tokenize(
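
The maxlen switch above is where _logits_all changes observable behaviour: with per-token logits the deque may hold up to n_ctx rows, otherwise only the most recent row is retained. A quick illustration of the standard-library deque semantics this relies on (the score values are invented):

from collections import deque

scores = [[0.1, 0.2], [0.3, 0.4], [0.5, 0.6]]  # one logits row per evaluated token

# logits_all=True: keep one row per token, bounded by the context size (3 as a stand-in for n_ctx).
print(deque(scores, maxlen=3))  # deque([[0.1, 0.2], [0.3, 0.4], [0.5, 0.6]], maxlen=3)

# logits_all=False: only the logits of the last evaluated token survive.
print(deque(scores, maxlen=1))  # deque([[0.5, 0.6]], maxlen=1)
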
@@ -641,13 +643,13 @@ def eval(self, tokens: Sequence[int]):
         n_past = self.n_tokens
         n_tokens = len(batch)
         self._batch.set_batch(
-            batch=batch, n_past=n_past, logits_all=self.context_params.logits_all
+            batch=batch, n_past=n_past, logits_all=self._logits_all
         )
         self._ctx.decode(self._batch)
         # Save tokens
         self.input_ids[n_past : n_past + n_tokens] = batch
         # Save logits
-        if self.context_params.logits_all:
+        if self._logits_all:
             rows = n_tokens
             cols = self._n_vocab
             logits = np.ctypeslib.as_array(
@@ -1288,7 +1290,7 @@ def logit_bias_processor(
         else:
             stop_sequences = []

-        if logprobs is not None and self.context_params.logits_all is False:
+        if logprobs is not None and self._logits_all is False:
             raise ValueError(
                 "logprobs is not supported for models created with logits_all=False"
             )
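
With the guard now reading self._logits_all, requesting token logprobs still requires constructing the model with logits_all=True; otherwise only the last row of logits is kept and the error above is raised. A hedged sketch of both sides of that guard (model path and prompt are placeholders):

from llama_cpp import Llama

# Default logits_all=False: asking for logprobs trips the ValueError shown in this hunk.
llm = Llama(model_path="./models/model.gguf", logits_all=False)  # placeholder path
try:
    llm.create_completion("Hello", max_tokens=4, logprobs=5)
except ValueError as e:
    print(e)  # logprobs is not supported for models created with logits_all=False

# With logits_all=True the same call returns per-token logprobs.
llm_full = Llama(model_path="./models/model.gguf", logits_all=True)
out = llm_full.create_completion("Hello", max_tokens=4, logprobs=5)
print(out["choices"][0]["logprobs"])
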
@@ -2091,10 +2093,12 @@ def __getstate__(self):
             yarn_beta_fast=self.context_params.yarn_beta_fast,
             yarn_beta_slow=self.context_params.yarn_beta_slow,
             yarn_orig_ctx=self.context_params.yarn_orig_ctx,
-            logits_all=self.context_params.logits_all,
+            logits_all=self._logits_all,
             embedding=self.context_params.embeddings,
             offload_kqv=self.context_params.offload_kqv,
             flash_attn=self.context_params.flash_attn,
+            op_offloat=self.context_params.op_offloat,
+            swa_full=self.context_params.swa_full,
             # Sampling Params
             no_perf=self.context_params.no_perf,
             last_n_tokens_size=self.last_n_tokens_size,
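
Because __getstate__ now reports logits_all from self._logits_all and also includes the two new flags, pickling a model should round-trip them like the other constructor arguments. A rough sketch, assuming a local GGUF file and enough memory to load the model a second time on unpickling:

import pickle
from llama_cpp import Llama

llm = Llama(model_path="./models/model.gguf", op_offloat=True, swa_full=False)  # placeholder path

# __getstate__/__setstate__ rebuild the model from its constructor kwargs,
# so the new flags survive the round trip alongside logits_all.
restored = pickle.loads(pickle.dumps(llm))
print(restored.context_params.op_offloat, restored.context_params.swa_full)
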
