@@ -615,6 +615,9 @@ class llama_model_kv_override(ctypes.Structure):
 
 
 # struct llama_model_params {
+#     // NULL-terminated list of devices to use for offloading (if NULL, all available devices are used)
+#     ggml_backend_dev_t * devices;
+
 #     int32_t n_gpu_layers; // number of layers to store in VRAM
 #     enum llama_split_mode split_mode; // how to split the model across multiple GPUs
@@ -680,6 +683,7 @@ class llama_model_params(ctypes.Structure):
         check_tensors: bool
 
     _fields_ = [
+        ("devices", ctypes.c_void_p),  # NOTE: unused
         ("n_gpu_layers", ctypes.c_int32),
         ("split_mode", ctypes.c_int),
         ("main_gpu", ctypes.c_int32),
@@ -1898,6 +1902,14 @@ def llama_kv_cache_update(ctx: llama_context_p, /):
     ...
 
 
+# // Check if the context supports KV cache shifting
+# LLAMA_API bool llama_kv_cache_can_shift(struct llama_context * ctx);
+@ctypes_function("llama_kv_cache_can_shift", [llama_context_p_ctypes], ctypes.c_bool)
+def llama_kv_cache_can_shift(ctx: llama_context_p, /) -> bool:
+    """Check if the context supports KV cache shifting"""
+    ...
+
+
 # //
 # // State / sessions
 # //
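A hedged usage sketch for the new check; the helper below and its `n_keep`/`n_past`/`n_discard` parameters are illustrative only, while `llama_kv_cache_seq_rm()` and `llama_kv_cache_seq_add()` are the shift primitives already bound elsewhere in this file:

```python
import llama_cpp

def shift_left(ctx: llama_cpp.llama_context_p, n_keep: int, n_past: int, n_discard: int) -> None:
    # Probe first: some models/backends cannot shift their KV cache,
    # and it is nicer to fail here than deep inside a decode call.
    if not llama_cpp.llama_kv_cache_can_shift(ctx):
        raise RuntimeError("context does not support KV cache shifting")
    # Drop n_discard tokens after the n_keep prefix of sequence 0 ...
    llama_cpp.llama_kv_cache_seq_rm(ctx, 0, n_keep, n_keep + n_discard)
    # ... then slide the remaining tokens left to close the gap.
    llama_cpp.llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_past, -n_discard)
```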
@@ -3621,13 +3633,3 @@ def llama_perf_sampler_reset(chain: llama_sampler_p, /):
     ...
 
 
-# LLAMA_API void llama_perf_dump_yaml(FILE * stream, const struct llama_context * ctx);
-@ctypes_function(
-    "llama_perf_dump_yaml",
-    [ctypes.POINTER(ctypes.c_void_p), llama_context_p_ctypes],
-    None,
-)
-def llama_perf_dump_yaml(
-    stream: ctypes.POINTER(ctypes.c_void_p), ctx: llama_context_p, /
-):
-    ...
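The function was removed upstream in llama.cpp, so the binding is dropped rather than left pointing at a missing symbol. A defensive sketch for downstream code that still referenced it; the fallback assumes `llama_perf_context_print()` remains bound, as the surrounding perf functions in this file suggest:

```python
import llama_cpp

def report_perf(ctx: llama_cpp.llama_context_p) -> None:
    # Probe by name so the same code works against wheels built
    # before and after llama_perf_dump_yaml was removed.
    if hasattr(llama_cpp, "llama_perf_dump_yaml"):
        ...  # only reachable on older builds; stream handling omitted
    else:
        llama_cpp.llama_perf_context_print(ctx)  # remaining perf printer
```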