
Commit b99e758

Merge pull request abetlen#604 from aliencaocao/main-1

Add doc string for n_gpu_layers argument and make -1 offload all layers

2 parents b345d60 + c471871

1 file changed, 2 insertions(+), 1 deletion(-)

llama_cpp/llama.py (+2, -1)

@@ -239,6 +239,7 @@ def __init__(
             n_ctx: Maximum context size.
             n_parts: Number of parts to split the model into. If -1, the number of parts is automatically determined.
             seed: Random seed. -1 for random.
+            n_gpu_layers: Number of layers to offload to GPU (-ngl). If -1, all layers are offloaded.
             f16_kv: Use half-precision for key/value cache.
             logits_all: Return logits for all tokens, not just the last token.
             vocab_only: Only load the vocabulary no weights.
@@ -267,7 +268,7 @@ def __init__(

         self.params = llama_cpp.llama_context_default_params()
         self.params.n_ctx = n_ctx
-        self.params.n_gpu_layers = n_gpu_layers
+        self.params.n_gpu_layers = 0x7FFFFFFF if n_gpu_layers == -1 else n_gpu_layers  # 0x7FFFFFFF is INT32 max, will be auto set to all layers
         self.params.seed = seed
         self.params.f16_kv = f16_kv
         self.params.logits_all = logits_all
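With this change, passing n_gpu_layers=-1 makes the wrapper substitute INT32 max (0x7FFFFFFF) into the context params, and llama.cpp treats any value at or above the model's layer count as "offload everything". A minimal usage sketch of the updated argument follows; the model path is a placeholder, and full offload assumes a GPU-enabled build of llama.cpp:

```python
from llama_cpp import Llama

# Placeholder model path; point this at a real local model file.
llm = Llama(
    model_path="./models/7B/ggml-model.bin",
    n_gpu_layers=-1,  # -1 is translated to 0x7FFFFFFF (INT32 max), i.e. offload all layers
)

# The translation performed inside __init__, shown standalone:
requested = -1
effective = 0x7FFFFFFF if requested == -1 else requested
print(effective)  # 2147483647; llama.cpp caps this at the model's actual layer count
```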
