Commit 0d37ce5

feat: Update llama.cpp

1 parent ffcd4b2 commit 0d37ce5

File tree: 3 files changed (+5, -145 lines)

  llama_cpp/llama_cpp.py
  llama_cpp/server/settings.py
  vendor/llama.cpp

llama_cpp/llama_cpp.py (+2, -142)
@@ -111,6 +111,7 @@ class CtypesRef(Generic[CtypesCData]):
 
 F = TypeVar("F", bound=Callable[..., Any])
 
+
 def ctypes_function_for_shared_library(lib: ctypes.CDLL):
     def ctypes_function(
         name: str, argtypes: List[Any], restype: Any, enabled: bool = True
@@ -938,18 +939,6 @@ def llama_supports_gpu_offload() -> bool:
     ...
 
 
-# LLAMA_API DEPRECATED(bool llama_mmap_supported (void), "use llama_supports_mmap() instead");
-@ctypes_function("llama_mmap_supported", [], ctypes.c_bool)
-def llama_mmap_supported() -> bool:
-    ...
-
-
-# LLAMA_API DEPRECATED(bool llama_mlock_supported(void), "use llama_supports_mlock() instead");
-@ctypes_function("llama_mlock_supported", [], ctypes.c_bool)
-def llama_mlock_supported() -> bool:
-    ...
-
-
 # LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);
 @ctypes_function("llama_get_model", [llama_context_p_ctypes], llama_model_p_ctypes)
 def llama_get_model(ctx: llama_context_p, /) -> Optional[llama_model_p]:
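
Migration note: the deprecated llama_mmap_supported()/llama_mlock_supported() bindings are dropped in this hunk; llama_supports_mmap()/llama_supports_mlock() (used elsewhere in this commit) are the replacements. A minimal sketch of the new capability check, assuming this build of the llama_cpp package is installed:

import llama_cpp

# The old llama_mmap_supported()/llama_mlock_supported() names no longer exist;
# the llama_supports_* bindings report the same capabilities.
print("mmap supported: ", llama_cpp.llama_supports_mmap())
print("mlock supported:", llama_cpp.llama_supports_mlock())
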
@@ -1158,47 +1147,6 @@ def llama_model_quantize(
     ...
 
 
-# // Apply a LoRA adapter to a loaded model
-# // path_base_model is the path to a higher quality model to use as a base for
-# // the layers modified by the adapter. Can be NULL to use the current loaded model.
-# // The model needs to be reloaded before applying a new adapter, otherwise the adapter
-# // will be applied on top of the previous one
-# // Returns 0 on success
-# LLAMA_API DEPRECATED(int32_t llama_apply_lora_from_file(
-#     struct llama_context * ctx,
-#     const char * path_lora,
-#     float scale,
-#     const char * path_base_model,
-#     int32_t n_threads),
-#     "use llama_model_apply_lora_from_file instead");
-@ctypes_function(
-    "llama_apply_lora_from_file",
-    [
-        llama_context_p_ctypes,
-        ctypes.c_char_p,
-        ctypes.c_float,
-        ctypes.c_char_p,
-        ctypes.c_int32,
-    ],
-    ctypes.c_int32,
-)
-def llama_apply_lora_from_file(
-    ctx: llama_context_p,
-    path_lora: Union[ctypes.c_char_p, bytes],
-    scale: Union[ctypes.c_float, float],
-    path_base_model: Union[ctypes.c_char_p, bytes],
-    n_threads: Union[ctypes.c_int32, int],
-    /,
-) -> int:
-    """Apply a LoRA adapter to a loaded model
-    path_base_model is the path to a higher quality model to use as a base for
-    the layers modified by the adapter. Can be NULL to use the current loaded model.
-    The model needs to be reloaded before applying a new adapter, otherwise the adapter
-    will be applied on top of the previous one
-    Returns 0 on success"""
-    ...
-
-
 # LLAMA_API int32_t llama_model_apply_lora_from_file(
 #     const struct llama_model * model,
 #     const char * path_lora,

@@ -1220,7 +1168,7 @@ def llama_model_apply_lora_from_file(
     model: llama_model_p,
     path_lora: Union[ctypes.c_char_p, bytes],
     scale: Union[ctypes.c_float, float],
-    path_base_model: Union[ctypes.c_char_p, bytes],
+    path_base_model: Union[ctypes.c_char_p, bytes, None],
     n_threads: Union[ctypes.c_int32, int],
     /,
 ) -> int:
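
Migration note: the context-level llama_apply_lora_from_file binding is removed, and llama_model_apply_lora_from_file now accepts None for path_base_model. A rough sketch of the surviving call path, with placeholder file paths and backend initialization omitted for brevity:

import llama_cpp

# Placeholder paths; substitute a real GGUF model and LoRA adapter file.
params = llama_cpp.llama_model_default_params()
model = llama_cpp.llama_load_model_from_file(b"./model.gguf", params)
if model:
    # path_base_model can now be passed as None to apply the adapter
    # against the already-loaded model's own weights.
    rc = llama_cpp.llama_model_apply_lora_from_file(
        model, b"./adapter.bin", 1.0, None, 4
    )
    print("LoRA applied" if rc == 0 else f"LoRA apply failed (rc={rc})")
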
@@ -1647,72 +1595,6 @@ def llama_save_session_file(
 # //
 
 
-# // Run the llama inference to obtain the logits and probabilities for the next token(s).
-# // tokens + n_tokens is the provided batch of new tokens to process
-# // n_past is the number of tokens to use from previous eval calls
-# // Returns 0 on success
-# // DEPRECATED: use llama_decode() instead
-# LLAMA_API DEPRECATED(int llama_eval(
-#     struct llama_context * ctx,
-#     llama_token * tokens,
-#     int32_t n_tokens,
-#     int32_t n_past),
-#     "use llama_decode() instead");
-@ctypes_function(
-    "llama_eval",
-    [
-        llama_context_p_ctypes,
-        llama_token_p,
-        ctypes.c_int32,
-        ctypes.c_int32,
-    ],
-    ctypes.c_int,
-)
-def llama_eval(
-    ctx: llama_context_p,
-    tokens: CtypesArray[llama_token],
-    n_tokens: Union[ctypes.c_int, int],
-    n_past: Union[ctypes.c_int, int],
-    /,
-) -> int:
-    """Run the llama inference to obtain the logits and probabilities for the next token(s).
-    tokens + n_tokens is the provided batch of new tokens to process
-    n_past is the number of tokens to use from previous eval calls
-    Returns 0 on success
-    DEPRECATED: use llama_decode() instead"""
-    ...
-
-
-# // Same as llama_eval, but use float matrix input directly.
-# // DEPRECATED: use llama_decode() instead
-# LLAMA_API DEPRECATED(int llama_eval_embd(
-#     struct llama_context * ctx,
-#     float * embd,
-#     int32_t n_tokens,
-#     int32_t n_past),
-#     "use llama_decode() instead");
-@ctypes_function(
-    "llama_eval_embd",
-    [
-        llama_context_p_ctypes,
-        ctypes.POINTER(ctypes.c_float),
-        ctypes.c_int32,
-        ctypes.c_int32,
-    ],
-    ctypes.c_int,
-)
-def llama_eval_embd(
-    ctx: llama_context_p,
-    embd: CtypesArray[ctypes.c_float],
-    n_tokens: Union[ctypes.c_int, int],
-    n_past: Union[ctypes.c_int, int],
-    /,
-) -> int:
-    """Same as llama_eval, but use float matrix input directly.
-    DEPRECATED: use llama_decode() instead"""
-    ...
-
-
 # // Return batch for single sequence of tokens starting at pos_0
 # //
 # // NOTE: this is a helper function to facilitate transition to the new batch API - avoid using it
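
Migration note: llama_eval/llama_eval_embd are gone; llama_decode with a llama_batch is the replacement, and llama_batch_get_one (the transition helper referenced in the context above) builds a single-sequence batch. A minimal sketch, assuming ctx is an already-initialized llama_context and token_ids holds tokenized input:

import llama_cpp

def decode_tokens(ctx, token_ids, n_past):
    # Pack the Python token ids into a ctypes llama_token array and build a
    # single-sequence batch starting at position n_past (sequence id 0).
    tokens = (llama_cpp.llama_token * len(token_ids))(*token_ids)
    batch = llama_cpp.llama_batch_get_one(tokens, len(token_ids), n_past, 0)
    # llama_decode returns 0 on success, mirroring the old llama_eval contract.
    return llama_cpp.llama_decode(ctx, batch)
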
@@ -2474,28 +2356,6 @@ def llama_sample_temp(
     ...
 
 
-# LLAMA_API DEPRECATED(void llama_sample_temperature(
-#     struct llama_context * ctx,
-#     llama_token_data_array * candidates,
-#     float temp),
-#     "use llama_sample_temp instead");
-@ctypes_function(
-    "llama_sample_temperature",
-    [llama_context_p_ctypes, llama_token_data_array_p, ctypes.c_float],
-    None,
-)
-def llama_sample_temperature(
-    ctx: llama_context_p,
-    candidates: Union[
-        CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array]
-    ],
-    temp: Union[ctypes.c_float, float],
-    /,
-):
-    """use llama_sample_temp instead"""
-    ...
-
-
 # /// @details Apply constraints from grammar
 # LLAMA_API void llama_sample_grammar(
 #     struct llama_context * ctx,
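
Migration note: the deprecated llama_sample_temperature alias is removed; llama_sample_temp takes the same context/candidates/temperature arguments. A thin sketch of the rename, assuming candidates_p is an already-prepared llama_token_data_array reference:

import llama_cpp

def apply_temperature(ctx, candidates_p, temp=0.8):
    # llama_sample_temp is the surviving name for temperature scaling of the
    # candidate token logits; the llama_sample_temperature alias no longer exists.
    llama_cpp.llama_sample_temp(ctx, candidates_p, temp)
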

llama_cpp/server/settings.py (+2, -2)

@@ -45,11 +45,11 @@ class ModelSettings(BaseSettings):
         default=False, description="Whether to only return the vocabulary."
     )
     use_mmap: bool = Field(
-        default=llama_cpp.llama_mmap_supported(),
+        default=llama_cpp.llama_supports_mmap(),
         description="Use mmap.",
     )
     use_mlock: bool = Field(
-        default=llama_cpp.llama_mlock_supported(),
+        default=llama_cpp.llama_supports_mlock(),
         description="Use mlock.",
     )
     kv_overrides: Optional[List[str]] = Field(
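
Note: with this change the server's ModelSettings defaults track what the linked llama.cpp build reports via llama_supports_mmap()/llama_supports_mlock(). A small sketch of how the defaults surface; the model path is a placeholder and nothing is loaded at construction time:

from llama_cpp.server.settings import ModelSettings

# "model" is a required field; use_mmap/use_mlock now default to the
# capabilities reported by the installed llama.cpp build.
settings = ModelSettings(model="./model.gguf")
print(settings.use_mmap, settings.use_mlock)
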

vendor/llama.cpp (+1, -1, submodule pointer updated)

0 commit comments
