Commit 9769e57

feat: Update llama.cpp
1 parent dcb0d0c · commit 9769e57

File tree

3 files changed: +5 -5 lines changed


llama_cpp/llama.py

+1 -1
@@ -153,7 +153,7 @@ def __init__(
             model_path: Path to the model.
             n_gpu_layers: Number of layers to offload to GPU (-ngl). If -1, all layers are offloaded.
             split_mode: How to split the model across GPUs. See llama_cpp.LLAMA_SPLIT_* for options.
-            main_gpu: main_gpu interpretation depends on split_mode: LLAMA_SPLIT_NONE: the GPU that is used for the entire model. LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results. LLAMA_SPLIT_LAYER: ignored
+            main_gpu: main_gpu interpretation depends on split_mode: LLAMA_SPLIT_MODE_NONE: the GPU that is used for the entire model. LLAMA_SPLIT_MODE_ROW: the GPU that is used for small tensors and intermediate results. LLAMA_SPLIT_MODE_LAYER: ignored
             tensor_split: How split tensors should be distributed across GPUs. If None, the model is not split.
             rpc_servers: Comma separated list of RPC servers to use for offloading
             vocab_only: Only load the vocabulary no weights.
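
For orientation (not part of the diff), a minimal sketch of how the renamed constants are meant to be used from the high-level API. The model path and the two-GPU tensor_split are illustrative assumptions, and it assumes the LLAMA_SPLIT_MODE_* constants are importable from the llama_cpp package, as the docstring's llama_cpp.LLAMA_SPLIT_* note suggests.

import llama_cpp
from llama_cpp import Llama

# Offload every layer and split them across GPUs; with
# LLAMA_SPLIT_MODE_LAYER the main_gpu value is ignored and
# tensor_split controls the per-GPU proportions.
llm = Llama(
    model_path="./models/example.gguf",           # hypothetical path
    n_gpu_layers=-1,                              # -1 = offload all layers
    split_mode=llama_cpp.LLAMA_SPLIT_MODE_LAYER,
    main_gpu=0,
    tensor_split=[0.5, 0.5],                      # assumes two GPUs
)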

llama_cpp/llama_cpp.py

+3 -3
@@ -711,9 +711,9 @@ class llama_model_kv_override(ctypes.Structure):
 # enum llama_split_mode split_mode; // how to split the model across multiple GPUs

 # // main_gpu interpretation depends on split_mode:
-# // LLAMA_SPLIT_NONE: the GPU that is used for the entire model
-# // LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results
-# // LLAMA_SPLIT_LAYER: ignored
+# // LLAMA_SPLIT_MODE_NONE: the GPU that is used for the entire model
+# // LLAMA_SPLIT_MODE_ROW: the GPU that is used for small tensors and intermediate results
+# // LLAMA_SPLIT_MODE_LAYER: ignored
 # int32_t main_gpu;

 # // proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
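
The same fields exist on the low-level llama_model_params struct that these comments document. A small sketch, assuming the renamed LLAMA_SPLIT_MODE_* constants are exposed by the bindings after this change; the GPU index and layer count are illustrative.

import llama_cpp

# Start from the library defaults and pin the whole model to one GPU.
params = llama_cpp.llama_model_default_params()
params.n_gpu_layers = 99                               # offload everything that fits
params.split_mode = llama_cpp.LLAMA_SPLIT_MODE_NONE    # no multi-GPU split
params.main_gpu = 1                                    # LLAMA_SPLIT_MODE_NONE: model lives on GPU 1
# Under LLAMA_SPLIT_MODE_LAYER, main_gpu would be ignored; under
# LLAMA_SPLIT_MODE_ROW it selects the GPU for small tensors and
# intermediate results, matching the comment above.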

vendor/llama.cpp
