diff --git a/CHANGELOG.md b/CHANGELOG.md index 53365e368..e08e52c10 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,19 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.3.10] + +- feat: Update llama.cpp to ggerganov/llama.cpp@8846aace4934ad29651ea61b8c7e3f6b0556e3d2 +- feat: Add support for llama.cpp multimodal, add Qwen2.5-VL chat handler by @abetlen in cd548bd0f14210627798237d5c2ea78acfb88ccb + +## [0.3.9] + +- feat: Update llama.cpp to ggerganov/llama.cpp@8733e0cf6eefc7c7752297cc22d0836706f4222c + +## [0.3.8] + +- feat: Update llama.cpp to ggerganov/llama.cpp@7841fc723e059d1fd9640e5c0ef19050fcc7c698 + ## [0.3.7] - feat: Update llama.cpp to ggerganov/llama.cpp@794fe23f29fb40104975c91fe19f23798f7c726e diff --git a/CMakeLists.txt b/CMakeLists.txt index 64a0304a1..4b06d98b3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -62,6 +62,9 @@ if (LLAMA_BUILD) # Enable building of the common library set(LLAMA_BUILD_COMMON ON CACHE BOOL "Build llama.cpp common library" FORCE) + # Disable building curl support + set(LLAMA_CURL OFF CACHE BOOL "llama.cpp: enable curl" FORCE) + # Architecture detection and settings for Apple platforms if (APPLE) # Get the target architecture @@ -93,7 +96,15 @@ if (LLAMA_BUILD) set(GGML_METAL_EMBED_LIBRARY "ON" CACHE BOOL "ggml: embed metal library" FORCE) endif() + add_subdirectory(vendor/llama.cpp) + + if (WIN32) + if (TARGET llama) + set_target_properties(llama PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON) + endif() + endif() + llama_cpp_python_install_target(llama) llama_cpp_python_install_target(ggml) @@ -143,35 +154,34 @@ if (LLAMA_BUILD) endif() # Building llava - add_subdirectory(vendor/llama.cpp/examples/llava) - set_target_properties(llava_shared PROPERTIES OUTPUT_NAME "llava") + add_subdirectory(vendor/llama.cpp/tools/mtmd) if (WIN32) - set_target_properties(llava_shared PROPERTIES CUDA_ARCHITECTURES OFF) + set_target_properties(mtmd PROPERTIES CUDA_ARCHITECTURES OFF) endif() - llama_cpp_python_install_target(llava_shared) + llama_cpp_python_install_target(mtmd) if (WIN32) install( - FILES $ + FILES $ DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib ) install( - FILES $ + FILES $ DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp/lib ) endif() - # Fix for llava build: Add include directory for llama.h + # Fix for mtmd build: Add include directory for llama.h # Move these commands after the add_subdirectory call - target_include_directories(llava PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/include) - target_include_directories(llava PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/ggml/include) + target_include_directories(mtmd PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/include) + target_include_directories(mtmd PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/ggml/include) if (BUILD_SHARED_LIBS) - target_include_directories(llava_shared PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/include) - target_include_directories(llava_shared PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/ggml/include) + target_include_directories(mtmd PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/include) + target_include_directories(mtmd PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/ggml/include) endif() - target_include_directories(llama-llava-cli PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/include) - target_include_directories(llama-minicpmv-cli PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/include) + # target_include_directories(llama-llava-cli PUBLIC 
${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/include) + # target_include_directories(llama-minicpmv-cli PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/include) endif() endif() diff --git a/README.md b/README.md index e00456580..088a23779 100644 --- a/README.md +++ b/README.md @@ -505,6 +505,7 @@ Below are the supported multi-modal models and their respective chat handlers (P | [nanollava](https://huggingface.co/abetlen/nanollava-gguf) | `NanollavaChatHandler` | `nanollava` | | [llama-3-vision-alpha](https://huggingface.co/abetlen/llama-3-vision-alpha-gguf) | `Llama3VisionAlphaChatHandler` | `llama-3-vision-alpha` | | [minicpm-v-2.6](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) | `MiniCPMv26ChatHandler` | `minicpm-v-2.6` | +| [qwen2.5-vl](https://huggingface.co/unsloth/Qwen2.5-VL-3B-Instruct-GGUF) | `Qwen25VLChatHandler` | `qwen2.5-vl` | Then you'll need to use a custom chat handler to load the clip model and process the chat messages and images. diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index fc1fcbcf6..11a511390 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.3.7" +__version__ = "0.3.10" diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index 343581dce..18d733481 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -9,6 +9,8 @@ Tuple, Optional, Sequence, + Callable, + Union, ) from dataclasses import dataclass, field from contextlib import ExitStack @@ -48,7 +50,7 @@ def __init__( raise ValueError(f"Model path does not exist: {path_model}") with suppress_stdout_stderr(disable=verbose): - model = llama_cpp.llama_load_model_from_file( + model = llama_cpp.llama_model_load_from_file( self.path_model.encode("utf-8"), self.params ) @@ -62,32 +64,38 @@ def __init__( self.model = model self.vocab = vocab + self.sampler = None # LlamaModel doesn't use samplers, but some cleanup code expects this attribute def free_model(): if self.model is None: return - llama_cpp.llama_free_model(self.model) + llama_cpp.llama_model_free(self.model) self.model = None self._exit_stack.callback(free_model) def close(self): + if self.sampler is not None: + # NOTE: Must remove custom samplers before free or llama.cpp will try to free them + for i, _ in reversed(self.custom_samplers): + llama_cpp.llama_sampler_chain_remove(self.sampler, i) + self.custom_samplers.clear() self._exit_stack.close() def __del__(self): self.close() def vocab_type(self) -> int: - return llama_cpp.llama_vocab_type(self.model) + return llama_cpp.llama_vocab_type(self.vocab) def n_vocab(self) -> int: - return llama_cpp.llama_n_vocab(self.vocab) + return llama_cpp.llama_vocab_n_tokens(self.vocab) def n_ctx_train(self) -> int: - return llama_cpp.llama_n_ctx_train(self.model) + return llama_cpp.llama_model_n_ctx_train(self.model) def n_embd(self) -> int: - return llama_cpp.llama_n_embd(self.model) + return llama_cpp.llama_model_n_embd(self.model) def rope_freq_scale_train(self) -> float: return llama_cpp.llama_model_rope_freq_scale_train(self.model) @@ -109,48 +117,48 @@ def get_tensor(self, name: str) -> ctypes.c_void_p: # Vocab def token_get_text(self, token: int) -> str: - return llama_cpp.llama_token_get_text(self.vocab, token).decode("utf-8") + return llama_cpp.llama_vocab_get_text(self.vocab, token).decode("utf-8") def token_get_score(self, token: int) -> float: - return llama_cpp.llama_token_get_score(self.vocab, token) + return llama_cpp.llama_vocab_get_score(self.vocab, token) def 
token_get_attr(self, token: int) -> int: - return llama_cpp.llama_token_get_attr(self.vocab, token) + return llama_cpp.llama_vocab_get_attr(self.vocab, token) # Special tokens def token_bos(self) -> int: - return llama_cpp.llama_token_bos(self.vocab) + return llama_cpp.llama_vocab_bos(self.vocab) def token_eos(self) -> int: - return llama_cpp.llama_token_eos(self.vocab) + return llama_cpp.llama_vocab_eos(self.vocab) def token_cls(self) -> int: - return llama_cpp.llama_token_cls(self.vocab) + return llama_cpp.llama_vocab_cls(self.vocab) def token_sep(self) -> int: - return llama_cpp.llama_token_sep(self.vocab) + return llama_cpp.llama_vocab_sep(self.vocab) def token_nl(self) -> int: - return llama_cpp.llama_token_nl(self.vocab) + return llama_cpp.llama_vocab_nl(self.vocab) def token_prefix(self) -> int: - raise NotImplementedError("token_prefix is not implemented in llama.cpp") + return llama_cpp.llama_vocab_fim_pre(self.vocab) def token_middle(self) -> int: - raise NotImplementedError("token_middle is not implemented in llama.cpp") + return llama_cpp.llama_vocab_fim_mid(self.vocab) def token_suffix(self) -> int: - raise NotImplementedError("token_suffix is not implemented in llama.cpp") + return llama_cpp.llama_vocab_fim_suf(self.vocab) def token_eot(self) -> int: - return llama_cpp.llama_token_eot(self.vocab) + return llama_cpp.llama_vocab_eot(self.vocab) def add_bos_token(self) -> bool: - return llama_cpp.llama_add_bos_token(self.vocab) + return llama_cpp.llama_vocab_get_add_bos(self.vocab) def add_eos_token(self) -> bool: - return llama_cpp.llama_add_eos_token(self.vocab) + return llama_cpp.llama_vocab_get_add_eos(self.vocab) # Tokenization @@ -249,12 +257,14 @@ def __init__( self.verbose = verbose self._exit_stack = ExitStack() - ctx = llama_cpp.llama_new_context_with_model(self.model.model, self.params) + ctx = llama_cpp.llama_init_from_model(self.model.model, self.params) if ctx is None: raise ValueError("Failed to create llama_context") self.ctx = ctx + self.memory = llama_cpp.llama_get_memory(self.ctx) + self.sampler = None # LlamaContext doesn't manage samplers directly, but some cleanup code expects this attribute def free_ctx(): if self.ctx is None: @@ -277,22 +287,22 @@ def pooling_type(self) -> int: return llama_cpp.llama_pooling_type(self.ctx) def kv_cache_clear(self): - llama_cpp.llama_kv_cache_clear(self.ctx) + llama_cpp.llama_memory_clear(self.memory, True) def kv_cache_seq_rm(self, seq_id: int, p0: int, p1: int): - llama_cpp.llama_kv_cache_seq_rm(self.ctx, seq_id, p0, p1) + llama_cpp.llama_memory_seq_rm(self.memory, seq_id, p0, p1) def kv_cache_seq_cp(self, seq_id_src: int, seq_id_dst: int, p0: int, p1: int): - llama_cpp.llama_kv_cache_seq_cp(self.ctx, seq_id_src, seq_id_dst, p0, p1) + llama_cpp.llama_memory_seq_cp(self.memory, seq_id_src, seq_id_dst, p0, p1) def kv_cache_seq_keep(self, seq_id: int): - llama_cpp.llama_kv_cache_seq_keep(self.ctx, seq_id) + llama_cpp.llama_memory_seq_keep(self.memory, seq_id) def kv_cache_seq_shift(self, seq_id: int, p0: int, p1: int, shift: int): - llama_cpp.llama_kv_cache_seq_add(self.ctx, seq_id, p0, p1, shift) + llama_cpp.llama_memory_seq_add(self.memory, seq_id, p0, p1, shift) def get_state_size(self) -> int: - return llama_cpp.llama_get_state_size(self.ctx) + return llama_cpp.llama_state_get_size(self.ctx) # TODO: copy_state_data @@ -310,6 +320,14 @@ def decode(self, batch: LlamaBatch): if return_code != 0: raise RuntimeError(f"llama_decode returned {return_code}") + def encode(self, batch: LlamaBatch): + return_code = 
llama_cpp.llama_encode( + self.ctx, + batch.batch, + ) + if return_code != 0: + raise RuntimeError(f"llama_encode returned {return_code}") + def set_n_threads(self, n_threads: int, n_threads_batch: int): llama_cpp.llama_set_n_threads(self.ctx, n_threads, n_threads_batch) @@ -322,12 +340,16 @@ def get_logits_ith(self, i: int): def get_embeddings(self): return llama_cpp.llama_get_embeddings(self.ctx) - # Sampling functions + def get_embeddings_ith(self, i: int): + return llama_cpp.llama_get_embeddings_ith(self.ctx, i) + + def get_embeddings_seq(self, seq_id: int): + return llama_cpp.llama_get_embeddings_seq(self.ctx, seq_id) + + # Sampling functions - deprecated, use LlamaSampler instead def set_rng_seed(self, seed: int): - # TODO: Fix - # llama_cpp.llama_set_rng_seed(self.ctx, seed) - raise NotImplementedError("set_rng_seed is not implemented in llama.cpp") + raise NotImplementedError("set_rng_seed is deprecated, use LlamaSampler instead") def sample_repetition_penalties( self, @@ -338,63 +360,30 @@ def sample_repetition_penalties( penalty_freq: float, penalty_present: float, ): - # llama_cpp.llama_sample_repetition_penalties( - # self.ctx, - # llama_cpp.byref(candidates.candidates), - # last_tokens_data, - # penalty_last_n, - # penalty_repeat, - # penalty_freq, - # penalty_present, - # ) - raise NotImplementedError("sample_repetition_penalties is not implemented in llama.cpp") + raise NotImplementedError("sample_repetition_penalties is deprecated, use LlamaSampler instead") def sample_softmax(self, candidates: "_LlamaTokenDataArray"): - # llama_cpp.llama_sample_softmax( - # self.ctx, - # llama_cpp.byref(candidates.candidates), - # ) - raise NotImplementedError("sample_softmax is not implemented in llama.cpp") + raise NotImplementedError("sample_softmax is deprecated, use LlamaSampler instead") def sample_top_k(self, candidates: "_LlamaTokenDataArray", k: int, min_keep: int): - # llama_cpp.llama_sample_top_k( - # self.ctx, llama_cpp.byref(candidates.candidates), k, min_keep - # ) - raise NotImplementedError("sample_top_k is not implemented in llama.cpp") + raise NotImplementedError("sample_top_k is deprecated, use LlamaSampler instead") def sample_top_p(self, candidates: "_LlamaTokenDataArray", p: float, min_keep: int): - # llama_cpp.llama_sample_top_p( - # self.ctx, llama_cpp.byref(candidates.candidates), p, min_keep - # ) - raise NotImplementedError("sample_top_p is not implemented in llama.cpp") + raise NotImplementedError("sample_top_p is deprecated, use LlamaSampler instead") def sample_min_p(self, candidates: "_LlamaTokenDataArray", p: float, min_keep: int): - # llama_cpp.llama_sample_min_p( - # self.ctx, llama_cpp.byref(candidates.candidates), p, min_keep - # ) - raise NotImplementedError("sample_min_p is not implemented in llama.cpp") + raise NotImplementedError("sample_min_p is deprecated, use LlamaSampler instead") def sample_typical( self, candidates: "_LlamaTokenDataArray", p: float, min_keep: int ): - # llama_cpp.llama_sample_typical( - # self.ctx, llama_cpp.byref(candidates.candidates), p, min_keep - # ) - raise NotImplementedError("sample_typical is not implemented in llama.cpp") + raise NotImplementedError("sample_typical is deprecated, use LlamaSampler instead") def sample_temp(self, candidates: "_LlamaTokenDataArray", temp: float): - # llama_cpp.llama_sample_temp( - # self.ctx, llama_cpp.byref(candidates.candidates), temp - # ) - raise NotImplementedError("sample_temp is not implemented in llama.cpp") + raise NotImplementedError("sample_temp is deprecated, use LlamaSampler 
instead") def sample_grammar(self, candidates: "_LlamaTokenDataArray", grammar: LlamaGrammar): - # llama_cpp.llama_sample_grammar( - # self.ctx, - # llama_cpp.byref(candidates.candidates), - # grammar.grammar, - # ) - raise NotImplementedError("sample_grammar is not implemented in llama.cpp") + raise NotImplementedError("sample_grammar is deprecated, use LlamaSampler instead") def sample_token_mirostat( self, @@ -404,15 +393,7 @@ def sample_token_mirostat( m: int, mu: llama_cpp.CtypesPointerOrRef[ctypes.c_float], ) -> int: - raise NotImplementedError("sample_token_mirostat is not implemented in llama.cpp") - # return llama_cpp.llama_sample_token_mirostat( - # self.ctx, - # llama_cpp.byref(candidates.candidates), - # tau, - # eta, - # m, - # mu, - # ) + raise NotImplementedError("sample_token_mirostat is deprecated, use LlamaSampler instead") def sample_token_mirostat_v2( self, @@ -421,33 +402,17 @@ def sample_token_mirostat_v2( eta: float, mu: llama_cpp.CtypesPointerOrRef[ctypes.c_float], ) -> int: - raise NotImplementedError("sample_token_mirostat_v2 is not implemented in llama.cpp") - # return llama_cpp.llama_sample_token_mirostat_v2( - # self.ctx, - # llama_cpp.byref(candidates.candidates), - # tau, - # eta, - # mu, - # ) + raise NotImplementedError("sample_token_mirostat_v2 is deprecated, use LlamaSampler instead") def sample_token_greedy(self, candidates: "_LlamaTokenDataArray") -> int: - raise NotImplementedError("sample_token_greedy is not implemented in llama.cpp") - # return llama_cpp.llama_sample_token_greedy( - # self.ctx, - # llama_cpp.byref(candidates.candidates), - # ) + raise NotImplementedError("sample_token_greedy is deprecated, use LlamaSampler instead") def sample_token(self, candidates: "_LlamaTokenDataArray") -> int: - raise NotImplementedError("sample_token is not implemented in llama.cpp") - # return llama_cpp.llama_sample_token( - # self.ctx, - # llama_cpp.byref(candidates.candidates), - # ) + raise NotImplementedError("sample_token is deprecated, use LlamaSampler instead") # Grammar def grammar_accept_token(self, grammar: LlamaGrammar, token: int): - raise NotImplementedError("grammar_accept_token is not implemented in llama.cpp") - # llama_cpp.llama_grammar_accept_token(grammar.grammar, self.ctx, token) + raise NotImplementedError("grammar_accept_token is deprecated, use LlamaSampler instead") def reset_timings(self): llama_cpp.llama_perf_context_reset(self.ctx) @@ -478,6 +443,7 @@ def __init__( raise ValueError("Failed to create llama_batch") self.batch = batch + self.sampler = None # LlamaBatch doesn't use samplers, but some cleanup code expects this attribute def free_batch(): if self.batch is None: @@ -540,6 +506,7 @@ def __init__(self, *, n_vocab: int): ) self.default_candidates_data_id = np.arange(self.n_vocab, dtype=np.intc) # type: ignore self.default_candidates_data_p = np.zeros(self.n_vocab, dtype=np.single) + self.sampler = None # LlamaTokenDataArray doesn't use samplers, but some cleanup code expects this attribute def copy_logits(self, logits: npt.NDArray[np.single]): self.candidates_data.id[:] = self.default_candidates_data_id @@ -628,103 +595,16 @@ def sample( idx: int = 0, logits_array: Optional[npt.NDArray[np.single]] = None, ): - n_vocab = ctx_main.model.n_vocab() - id: int = 0 - - if logits_array is None: - logits = ctx_main.get_logits_ith(idx) - logits_array = np.array( - ctypes.cast(logits, ctypes.POINTER(ctypes.c_float * n_vocab)).contents, - dtype=np.single, - ) - - # apply logit_bias - for token, logit_bias in 
self.params.logit_bias.items(): - logits_array[token] += logit_bias - - token_data_array = LlamaTokenDataArray( - n_vocab=n_vocab - ) # TODO: Only create this once - token_data_array.copy_logits(logits_array) - - # apply penalties - if len(self.prev) > 0: - nl_token = ctx_main.model.token_nl() - nl_logit = logits_array[nl_token] - last_tokens = self.prev[-self.params.penalty_last_n :] - last_tokens_size = min(len(last_tokens), self.params.penalty_last_n) - if last_tokens_size > 0: - last_tokens_p = (llama_cpp.llama_token * len(last_tokens))(*last_tokens) - ctx_main.sample_repetition_penalties( - token_data_array, - last_tokens_p, - last_tokens_size, - self.params.penalty_repeat, - self.params.penalty_freq, - self.params.penalty_present, - ) - if not self.params.penalize_nl: - token_data_array.candidates_data.logit[nl_token] = nl_logit - - if self.grammar is not None: - ctx_main.sample_grammar(token_data_array, self.grammar) - - if self.params.temp < 0: - ctx_main.sample_softmax(token_data_array) - id = token_data_array.candidates_data.id[0] - elif self.params.temp == 0: - id = ctx_main.sample_token_greedy(token_data_array) - else: - if self.params.mirostat == 1: - mirostat_m = 100 - ctx_main.sample_temp(token_data_array, self.params.temp) - id = ctx_main.sample_token_mirostat( - token_data_array, - self.params.mirostat_tau, - self.params.mirostat_eta, - mirostat_m, - ctypes.pointer(self.mirostat_mu), - ) - elif self.params.mirostat == 2: - ctx_main.sample_temp(token_data_array, self.params.temp) - id = ctx_main.sample_token_mirostat_v2( - token_data_array, - self.params.mirostat_tau, - self.params.mirostat_eta, - ctypes.pointer(self.mirostat_mu), - ) - else: - min_keep = max(1, self.params.n_probs) - ctx_main.sample_top_k( - token_data_array, self.params.top_k, min_keep=min_keep - ) - ctx_main.sample_typical( - token_data_array, self.params.typical_p, min_keep=min_keep - ) - ctx_main.sample_top_p( - token_data_array, self.params.top_p, min_keep=min_keep - ) - ctx_main.sample_min_p( - token_data_array, self.params.min_p, min_keep=min_keep - ) - ctx_main.sample_temp(token_data_array, self.params.temp) - id = ctx_main.sample_token(token_data_array) - return id + # This method is deprecated in favor of using LlamaSampler directly + raise NotImplementedError("LlamaSamplingContext.sample is deprecated, use LlamaSampler instead") def accept(self, ctx_main: LlamaContext, id: int, apply_grammar: bool): - if apply_grammar and self.grammar is not None: - ctx_main.grammar_accept_token(self.grammar, id) self.prev.append(id) -from typing import List, Callable, Optional, Union -import ctypes -import llama_cpp - - class CustomSampler: def __init__( - self, apply_func: typing.Callable[[llama_cpp.llama_token_data_array], None] + self, apply_func: Callable[[llama_cpp.llama_token_data_array], None] ): self.apply_func = apply_func @@ -757,72 +637,117 @@ def get_sampler(self) -> llama_cpp.llama_sampler_p: class LlamaSampler: def __init__(self): - params = llama_cpp.llama_sampler_chain_params() + params = llama_cpp.llama_sampler_chain_default_params() self.sampler = llama_cpp.llama_sampler_chain_init(params) - self.samplers: List[llama_cpp.llama_sampler_p] = [] self.custom_samplers: List[Tuple[int, CustomSampler]] = [] + self._exit_stack = ExitStack() + + def free_sampler(): + if self.sampler is not None: + # NOTE: Must remove custom samplers before free or llama.cpp will try to free them + for i, _ in reversed(self.custom_samplers): + llama_cpp.llama_sampler_chain_remove(self.sampler, i) + 
llama_cpp.llama_sampler_free(self.sampler) + self.sampler = None + + self._exit_stack.callback(free_sampler) + + def close(self): + self._exit_stack.close() + + def __del__(self): + self.close() def add_greedy(self): sampler = llama_cpp.llama_sampler_init_greedy() - self._add_sampler(sampler) + llama_cpp.llama_sampler_chain_add(self.sampler, sampler) def add_dist(self, seed: int): sampler = llama_cpp.llama_sampler_init_dist(seed) - self._add_sampler(sampler) + llama_cpp.llama_sampler_chain_add(self.sampler, sampler) def add_softmax(self): sampler = llama_cpp.llama_sampler_init_softmax() - self._add_sampler(sampler) + llama_cpp.llama_sampler_chain_add(self.sampler, sampler) def add_top_k(self, k: int): sampler = llama_cpp.llama_sampler_init_top_k(k) - self._add_sampler(sampler) + llama_cpp.llama_sampler_chain_add(self.sampler, sampler) - def add_top_p(self, p: float, min_keep: int): + def add_top_p(self, p: float, min_keep: int = 1): sampler = llama_cpp.llama_sampler_init_top_p(p, min_keep) - self._add_sampler(sampler) + llama_cpp.llama_sampler_chain_add(self.sampler, sampler) - def add_min_p(self, p: float, min_keep: int): + def add_min_p(self, p: float, min_keep: int = 1): sampler = llama_cpp.llama_sampler_init_min_p(p, min_keep) - self._add_sampler(sampler) + llama_cpp.llama_sampler_chain_add(self.sampler, sampler) - def add_typical(self, p: float, min_keep: int): + def add_typical(self, p: float, min_keep: int = 1): sampler = llama_cpp.llama_sampler_init_typical(p, min_keep) - self._add_sampler(sampler) + llama_cpp.llama_sampler_chain_add(self.sampler, sampler) def add_temp(self, temp: float): sampler = llama_cpp.llama_sampler_init_temp(temp) - self._add_sampler(sampler) + llama_cpp.llama_sampler_chain_add(self.sampler, sampler) def add_temp_ext(self, t: float, delta: float, exponent: float): sampler = llama_cpp.llama_sampler_init_temp_ext(t, delta, exponent) - self._add_sampler(sampler) + llama_cpp.llama_sampler_chain_add(self.sampler, sampler) + + def add_xtc(self, p: float, t: float, min_keep: int, seed: int): + sampler = llama_cpp.llama_sampler_init_xtc(p, t, min_keep, seed) + llama_cpp.llama_sampler_chain_add(self.sampler, sampler) + + def add_top_n_sigma(self, n: float): + sampler = llama_cpp.llama_sampler_init_top_n_sigma(n) + llama_cpp.llama_sampler_chain_add(self.sampler, sampler) def add_mirostat(self, n_vocab: int, seed: int, tau: float, eta: float, m: int): sampler = llama_cpp.llama_sampler_init_mirostat(n_vocab, seed, tau, eta, m) - self._add_sampler(sampler) + llama_cpp.llama_sampler_chain_add(self.sampler, sampler) def add_mirostat_v2(self, seed: int, tau: float, eta: float): sampler = llama_cpp.llama_sampler_init_mirostat_v2(seed, tau, eta) - self._add_sampler(sampler) + llama_cpp.llama_sampler_chain_add(self.sampler, sampler) def add_grammar(self, model: LlamaModel, grammar: LlamaGrammar): sampler = llama_cpp.llama_sampler_init_grammar( model.vocab, grammar._grammar.encode("utf-8"), grammar._root.encode("utf-8") ) - self._add_sampler(sampler) + llama_cpp.llama_sampler_chain_add(self.sampler, sampler) + + def add_grammar_lazy_patterns( + self, + model: LlamaModel, + grammar: LlamaGrammar, + trigger_patterns: List[str], + trigger_tokens: List[int] + ): + # Convert patterns to C array + pattern_ptrs = (ctypes.c_char_p * len(trigger_patterns))() + for i, pattern in enumerate(trigger_patterns): + pattern_ptrs[i] = pattern.encode("utf-8") + + # Convert tokens to C array + token_array = (llama_cpp.llama_token * len(trigger_tokens))(*trigger_tokens) + + sampler = 
llama_cpp.llama_sampler_init_grammar_lazy_patterns( + model.vocab, + grammar._grammar.encode("utf-8"), + grammar._root.encode("utf-8"), + pattern_ptrs, + len(trigger_patterns), + token_array, + len(trigger_tokens) + ) + llama_cpp.llama_sampler_chain_add(self.sampler, sampler) def add_penalties( self, - n_vocab: int, - special_eos_id: int, - linefeed_id: int, penalty_last_n: int, penalty_repeat: float, penalty_freq: float, penalty_present: float, - penalize_nl: bool, - ignore_eos: bool, ): sampler = llama_cpp.llama_sampler_init_penalties( penalty_last_n, @@ -830,50 +755,96 @@ def add_penalties( penalty_freq, penalty_present, ) - self._add_sampler(sampler) + llama_cpp.llama_sampler_chain_add(self.sampler, sampler) - def init_logit_bias( - self, n_vocab: int, n_logit_bias, logit_bias: llama_cpp.llama_logit_bias_p + def add_dry( + self, + model: LlamaModel, + n_ctx_train: int, + dry_multiplier: float, + dry_base: float, + dry_allowed_length: int, + dry_penalty_last_n: int, + seq_breakers: List[str] + ): + # Convert seq_breakers to C array + breaker_ptrs = (ctypes.c_char_p * len(seq_breakers))() + for i, breaker in enumerate(seq_breakers): + breaker_ptrs[i] = breaker.encode("utf-8") + + sampler = llama_cpp.llama_sampler_init_dry( + model.vocab, + n_ctx_train, + dry_multiplier, + dry_base, + dry_allowed_length, + dry_penalty_last_n, + breaker_ptrs, + len(seq_breakers) + ) + llama_cpp.llama_sampler_chain_add(self.sampler, sampler) + + def add_logit_bias( + self, + n_vocab: int, + logit_bias: Dict[int, float] ): + # Convert logit_bias dict to C array + bias_array = (llama_cpp.llama_logit_bias * len(logit_bias))() + for i, (token, bias) in enumerate(logit_bias.items()): + bias_array[i].token = token + bias_array[i].bias = bias + sampler = llama_cpp.llama_sampler_init_logit_bias( - n_vocab, n_logit_bias, logit_bias + n_vocab, + len(logit_bias), + bias_array ) - self._add_sampler(sampler) + llama_cpp.llama_sampler_chain_add(self.sampler, sampler) + + def add_infill(self, model: LlamaModel): + sampler = llama_cpp.llama_sampler_init_infill(model.vocab) + llama_cpp.llama_sampler_chain_add(self.sampler, sampler) def add_custom( self, apply_func: Callable[[llama_cpp.llama_token_data_array], None] ): custom_sampler = CustomSampler(apply_func) sampler = custom_sampler.get_sampler() - self._add_sampler(sampler) + llama_cpp.llama_sampler_chain_add(self.sampler, sampler) # NOTE: Must remove custom samplers before free or llama.cpp will try to free them self.custom_samplers.append( (llama_cpp.llama_sampler_chain_n(self.sampler) - 1, custom_sampler) ) - def _add_sampler(self, sampler: llama_cpp.llama_sampler_p): - assert self.sampler is not None - llama_cpp.llama_sampler_chain_add(self.sampler, sampler) - self.samplers.append(sampler) - def get_seed(self) -> int: - assert self.sampler is not None return llama_cpp.llama_sampler_get_seed(self.sampler) - def sample(self, ctx: LlamaContext, idx: int) -> int: - assert self.sampler is not None - assert ctx.ctx is not None + def sample(self, ctx: LlamaContext, idx: int = -1) -> int: return llama_cpp.llama_sampler_sample(self.sampler, ctx.ctx, idx) - def close(self): - if self.sampler: - # NOTE: Must remove custom samplers before free or llama.cpp will try to free them - for i, _ in reversed(self.custom_samplers): - llama_cpp.llama_sampler_chain_remove(self.sampler, i) - llama_cpp.llama_sampler_free(self.sampler) - self.sampler = None - self.samplers.clear() - self.custom_samplers.clear() + def accept(self, token: int): + llama_cpp.llama_sampler_accept(self.sampler, 
token) - def __del__(self): - self.close() + def reset(self): + llama_cpp.llama_sampler_reset(self.sampler) + + def clone(self): + # NOTE: Custom samplers cannot be cloned due to Python callback limitations + if self.custom_samplers: + raise NotImplementedError("Cannot clone LlamaSampler that contains custom samplers") + + cloned_sampler = llama_cpp.llama_sampler_clone(self.sampler) + # Create a new wrapper around the cloned sampler + new_sampler = LlamaSampler.__new__(LlamaSampler) + new_sampler.sampler = cloned_sampler + new_sampler.custom_samplers = [] + new_sampler._exit_stack = ExitStack() + + def free_sampler(): + if new_sampler.sampler is not None: + llama_cpp.llama_sampler_free(new_sampler.sampler) + new_sampler.sampler = None + + new_sampler._exit_stack.callback(free_sampler) + return new_sampler diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 7e9a6af23..cdc05c7ad 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -66,7 +66,6 @@ def __init__( split_mode: int = llama_cpp.LLAMA_SPLIT_MODE_LAYER, main_gpu: int = 0, tensor_split: Optional[List[float]] = None, - rpc_servers: Optional[str] = None, vocab_only: bool = False, use_mmap: bool = True, use_mlock: bool = False, @@ -93,6 +92,8 @@ def __init__( embedding: bool = False, offload_kqv: bool = True, flash_attn: bool = False, + op_offloat: Optional[bool] = None, + swa_full: Optional[bool] = None, # Sampling Params no_perf: bool = False, last_n_tokens_size: int = 64, @@ -150,7 +151,6 @@ def __init__( split_mode: How to split the model across GPUs. See llama_cpp.LLAMA_SPLIT_* for options. main_gpu: main_gpu interpretation depends on split_mode: LLAMA_SPLIT_MODE_NONE: the GPU that is used for the entire model. LLAMA_SPLIT_MODE_ROW: the GPU that is used for small tensors and intermediate results. LLAMA_SPLIT_MODE_LAYER: ignored tensor_split: How split tensors should be distributed across GPUs. If None, the model is not split. - rpc_servers: Comma separated list of RPC servers to use for offloading vocab_only: Only load the vocabulary no weights. use_mmap: Use mmap if possible. use_mlock: Force the system to keep the model in RAM. @@ -174,6 +174,8 @@ def __init__( embedding: Embedding mode only. offload_kqv: Offload K, Q, V to GPU. flash_attn: Use flash attention. + op_offloat: offload host tensor operations to device + swa_full: use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055) no_perf: Measure performance timings. last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque. lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model. 
@@ -226,11 +228,6 @@ def __init__( ) # 0x7FFFFFFF is INT32 max, will be auto set to all layers self.model_params.split_mode = split_mode self.model_params.main_gpu = main_gpu - if rpc_servers is not None: - self.model_params.rpc_servers = rpc_servers.encode("utf-8") - self._rpc_servers = rpc_servers - else: - self._rpc_servers = None self.tensor_split = tensor_split self._c_tensor_split = None if self.tensor_split is not None: @@ -341,12 +338,17 @@ def __init__( yarn_beta_slow if yarn_beta_slow != 0.0 else 0 ) self.context_params.yarn_orig_ctx = yarn_orig_ctx if yarn_orig_ctx != 0 else 0 - self.context_params.logits_all = ( - logits_all if draft_model is None else True - ) # Must be set to True for speculative decoding + self._logits_all = logits_all if draft_model is None else True self.context_params.embeddings = embedding # TODO: Rename to embeddings self.context_params.offload_kqv = offload_kqv self.context_params.flash_attn = flash_attn + + if op_offloat is not None: + self.context_params.op_offloat = op_offloat + + if swa_full is not None: + self.context_params.swa_full = swa_full + # KV cache quantization if type_k is not None: self.context_params.type_k = type_k @@ -568,7 +570,7 @@ def eval_tokens(self) -> Deque[int]: def eval_logits(self) -> Deque[List[float]]: return deque( self.scores[: self.n_tokens, :].tolist(), - maxlen=self._n_ctx if self.context_params.logits_all else 1, + maxlen=self._n_ctx if self._logits_all else 1, ) def tokenize( @@ -641,13 +643,13 @@ def eval(self, tokens: Sequence[int]): n_past = self.n_tokens n_tokens = len(batch) self._batch.set_batch( - batch=batch, n_past=n_past, logits_all=self.context_params.logits_all + batch=batch, n_past=n_past, logits_all=self._logits_all ) self._ctx.decode(self._batch) # Save tokens self.input_ids[n_past : n_past + n_tokens] = batch # Save logits - if self.context_params.logits_all: + if self._logits_all: rows = n_tokens cols = self._n_vocab logits = np.ctypeslib.as_array( @@ -709,15 +711,15 @@ def apply_func(token_data_array: llama_cpp.llama_token_data_array_p): sampler.add_custom(apply_func) sampler.add_penalties( - n_vocab=self._n_vocab, - special_eos_id=self._token_eos, - linefeed_id=self._token_nl, + # n_vocab=self._n_vocab, + # special_eos_id=self._token_eos, + # linefeed_id=self._token_nl, penalty_last_n=self.last_n_tokens_size, penalty_repeat=repeat_penalty, penalty_freq=frequency_penalty, penalty_present=presence_penalty, - penalize_nl=penalize_nl, - ignore_eos=False, + # penalize_nl=penalize_nl, + # ignore_eos=False, ) if grammar is not None: @@ -1288,7 +1290,7 @@ def logit_bias_processor( else: stop_sequences = [] - if logprobs is not None and self.context_params.logits_all is False: + if logprobs is not None and self._logits_all is False: raise ValueError( "logprobs is not supported for models created with logits_all=False" ) @@ -2091,10 +2093,12 @@ def __getstate__(self): yarn_beta_fast=self.context_params.yarn_beta_fast, yarn_beta_slow=self.context_params.yarn_beta_slow, yarn_orig_ctx=self.context_params.yarn_orig_ctx, - logits_all=self.context_params.logits_all, + logits_all=self._logits_all, embedding=self.context_params.embeddings, offload_kqv=self.context_params.offload_kqv, flash_attn=self.context_params.flash_attn, + op_offloat=self.context_params.op_offloat, + swa_full=self.context_params.swa_full, # Sampling Params no_perf=self.context_params.no_perf, last_n_tokens_size=self.last_n_tokens_size, diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 17575c700..a288db7b0 
100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -28,6 +28,7 @@ import numpy as np import numpy.typing as npt +import llama_cpp.llama_cpp as llama_cpp import llama_cpp.llama as llama import llama_cpp.llama_types as llama_types import llama_cpp.llama_grammar as llama_grammar @@ -2651,7 +2652,7 @@ def generate_streaming(tools, functions, function_call, prompt): class Llava15ChatHandler: DEFAULT_SYSTEM_MESSAGE: Optional[str] = ( - "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions." + "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions." ) CHAT_FORMAT = ( @@ -2690,70 +2691,72 @@ class Llava15ChatHandler: ) def __init__(self, clip_model_path: str, verbose: bool = True): - import llama_cpp.llava_cpp as llava_cpp + import llama_cpp.mtmd_cpp as mtmd_cpp self.clip_model_path = clip_model_path self.verbose = verbose - - self._llava_cpp = llava_cpp # TODO: Fix + self._mtmd_cpp = mtmd_cpp self._exit_stack = ExitStack() - self._last_image_embed: Optional[ - llava_cpp.CtypesPointer[llava_cpp.llava_image_embed] - ] = None - self._last_image_hash: Optional[int] = None + self.mtmd_ctx: Optional[mtmd_cpp.mtmd_context_p] = None if not os.path.exists(clip_model_path): raise ValueError(f"Clip model path does not exist: {clip_model_path}") + def _init_mtmd_context(self, llama_model: llama.Llama): + """Initialize mtmd context with the llama model.""" + if self.mtmd_ctx is not None: + return # Already initialized + with suppress_stdout_stderr(disable=self.verbose): - clip_ctx = self._llava_cpp.clip_model_load(self.clip_model_path.encode(), 0) + # Get default parameters + ctx_params = self._mtmd_cpp.mtmd_context_params_default() + ctx_params.use_gpu = True # TODO: Make this configurable + ctx_params.print_timings = self.verbose + ctx_params.n_threads = llama_model.n_threads + ctx_params.verbosity = 2 if self.verbose else 0 # GGML_LOG_LEVEL_INFO = 2 + + # Initialize mtmd context + self.mtmd_ctx = self._mtmd_cpp.mtmd_init_from_file( + self.clip_model_path.encode(), + llama_model.model, + ctx_params + ) - if clip_ctx is None: - raise ValueError(f"Failed to load clip model: {clip_model_path}") + if self.mtmd_ctx is None: + raise ValueError(f"Failed to load mtmd context from: {self.clip_model_path}") - self.clip_ctx = clip_ctx + # Check if vision is supported + if not self._mtmd_cpp.mtmd_support_vision(self.mtmd_ctx): + raise ValueError("Vision is not supported by this model") - def clip_free(): + def mtmd_free(): with suppress_stdout_stderr(disable=self.verbose): - self._llava_cpp.clip_free(self.clip_ctx) - - self._exit_stack.callback(clip_free) + if self.mtmd_ctx is not None: + self._mtmd_cpp.mtmd_free(self.mtmd_ctx) + self.mtmd_ctx = None - def last_image_embed_free(): - with suppress_stdout_stderr(disable=self.verbose): - if self._last_image_embed is not None: - self._llava_cpp.llava_image_embed_free(self._last_image_embed) - self._last_image_embed = None - - self._exit_stack.callback(last_image_embed_free) + self._exit_stack.callback(mtmd_free) def load_image(self, image_url: str) -> bytes: return self._load_image(image_url) - def _embed_image_bytes(self, image_bytes: bytes, n_threads_batch: int = 1): - if ( - self._last_image_embed is not None - and self._last_image_hash is not None - and hash(image_bytes) == self._last_image_hash - ): - return self._last_image_embed + 
def _create_bitmap_from_bytes(self, image_bytes: bytes): + """Create mtmd_bitmap from image bytes.""" + if self.mtmd_ctx is None: + raise ValueError("mtmd context not initialized") + with suppress_stdout_stderr(disable=self.verbose): - # Free the previous image embed - if self._last_image_embed is not None: - self._llava_cpp.llava_image_embed_free(self._last_image_embed) - self._last_image_embed = None - self._last_image_hash = None - embed = self._llava_cpp.llava_image_embed_make_with_bytes( - self.clip_ctx, - n_threads_batch, - (ctypes.c_uint8 * len(image_bytes)).from_buffer( - bytearray(image_bytes) - ), - len(image_bytes), + # Create bitmap from buffer using helper function + bitmap = self._mtmd_cpp.mtmd_helper_bitmap_init_from_buf( + self.mtmd_ctx, + (ctypes.c_uint8 * len(image_bytes)).from_buffer(bytearray(image_bytes)), + len(image_bytes) ) - self._last_image_embed = embed - self._last_image_hash = hash(image_bytes) - return embed + + if bitmap is None: + raise ValueError("Failed to create bitmap from image bytes") + + return bitmap def __call__( self, @@ -2794,7 +2797,9 @@ def __call__( llama_types.CreateChatCompletionResponse, Iterator[llama_types.CreateChatCompletionStreamResponse], ]: - assert self.clip_ctx is not None + # Initialize mtmd context + self._init_mtmd_context(llama) + assert self.mtmd_ctx is not None system_prompt = _get_system_message(messages) if system_prompt == "" and self.DEFAULT_SYSTEM_MESSAGE is not None: @@ -2809,54 +2814,131 @@ def __call__( trim_blocks=True, lstrip_blocks=True, ).from_string(self.CHAT_FORMAT) + + # Get the default media marker + media_marker = self._mtmd_cpp.mtmd_default_marker().decode('utf-8') + + # Replace image URLs with media markers in the template text = template.render( messages=messages, add_generation_prompt=True, eos_token=llama.detokenize([llama.token_eos()]), bos_token=llama.detokenize([llama.token_bos()]), ) - split_text = self.split_text_on_image_urls(text, image_urls) + + # Replace image URLs in text with media markers + for image_url in image_urls: + text = text.replace(image_url, media_marker) if self.verbose: print(text, file=sys.stderr) + # Create bitmaps from images + bitmaps = [] + bitmap_cleanup = [] + try: + for image_url in image_urls: + image_bytes = self.load_image(image_url) + bitmap = self._create_bitmap_from_bytes(image_bytes) + bitmaps.append(bitmap) + bitmap_cleanup.append(bitmap) + + # Create input text structure + input_text = self._mtmd_cpp.mtmd_input_text() + input_text.text = text.encode('utf-8') + input_text.add_special = True + input_text.parse_special = True + + # Create input chunks + chunks = self._mtmd_cpp.mtmd_input_chunks_init() + if chunks is None: + raise ValueError("Failed to create input chunks") - # Evaluate prompt - llama.reset() - llama._ctx.kv_cache_clear() - for type_, value in split_text: - if type_ == "text": - tokens = llama.tokenize( - value.encode("utf8"), add_bos=False, special=True + try: + # Tokenize text and images together + bitmap_array = (self._mtmd_cpp.mtmd_bitmap_p_ctypes * len(bitmaps))(*bitmaps) + result = self._mtmd_cpp.mtmd_tokenize( + self.mtmd_ctx, + chunks, + ctypes.byref(input_text), + bitmap_array, + len(bitmaps) ) - if llama.n_tokens + len(tokens) > llama.n_ctx(): - raise ValueError( - f"Prompt exceeds n_ctx: {llama.n_tokens + len(tokens)} > {llama.n_ctx()}" - ) - llama.eval(tokens) - else: - image_bytes = self.load_image(value) - embed = self._embed_image_bytes(image_bytes, llama.context_params.n_threads_batch) - if llama.n_tokens + embed.contents.n_image_pos > 
llama.n_ctx(): - raise ValueError( - f"Prompt exceeds n_ctx: {llama.n_tokens + embed.contents.n_image_pos} > {llama.n_ctx()}" - ) - n_past = ctypes.c_int(llama.n_tokens) - n_past_p = ctypes.pointer(n_past) - with suppress_stdout_stderr(disable=self.verbose): - self._llava_cpp.llava_eval_image_embed( - llama.ctx, - embed, - llama.n_batch, - n_past_p, - ) - # Required to avoid issues with hf tokenizer - llama.input_ids[llama.n_tokens : n_past.value] = -1 - llama.n_tokens = n_past.value - # Get prompt tokens to avoid a cache miss - prompt = llama.input_ids[: llama.n_tokens].tolist() + if result != 0: + raise ValueError(f"Failed to tokenize input: error code {result}") + + # Reset llama context + llama.reset() + llama._ctx.kv_cache_clear() + + # Process each chunk + n_past = llama_cpp.llama_pos(0) + n_chunks = self._mtmd_cpp.mtmd_input_chunks_size(chunks) + + for i in range(n_chunks): + chunk = self._mtmd_cpp.mtmd_input_chunks_get(chunks, i) + if chunk is None: + continue + + chunk_type = self._mtmd_cpp.mtmd_input_chunk_get_type(chunk) + + if chunk_type == self._mtmd_cpp.MTMD_INPUT_CHUNK_TYPE_TEXT: + # Handle text chunk + n_tokens_out = ctypes.c_size_t() + tokens_ptr = self._mtmd_cpp.mtmd_input_chunk_get_tokens_text( + chunk, ctypes.byref(n_tokens_out) + ) + + if tokens_ptr and n_tokens_out.value > 0: + # Convert ctypes array to Python list + tokens = [tokens_ptr[j] for j in range(n_tokens_out.value)] + + if llama.n_tokens + len(tokens) > llama.n_ctx(): + raise ValueError( + f"Prompt exceeds n_ctx: {llama.n_tokens + len(tokens)} > {llama.n_ctx()}" + ) + llama.eval(tokens) + + elif chunk_type in [self._mtmd_cpp.MTMD_INPUT_CHUNK_TYPE_IMAGE, self._mtmd_cpp.MTMD_INPUT_CHUNK_TYPE_AUDIO]: + # Handle image/audio chunk using helper + chunk_n_tokens = self._mtmd_cpp.mtmd_input_chunk_get_n_tokens(chunk) + + if llama.n_tokens + chunk_n_tokens > llama.n_ctx(): + raise ValueError( + f"Prompt exceeds n_ctx: {llama.n_tokens + chunk_n_tokens} > {llama.n_ctx()}" + ) + + new_n_past = llama_cpp.llama_pos(0) + result = self._mtmd_cpp.mtmd_helper_eval_chunk_single( + self.mtmd_ctx, + llama._ctx.ctx, + chunk, + llama_cpp.llama_pos(llama.n_tokens), + llama_cpp.llama_seq_id(0), + llama.n_batch, + False, # logits_last + ctypes.byref(new_n_past) + ) + + if result != 0: + raise ValueError(f"Failed to evaluate chunk: error code {result}") + + # Update llama's token count + llama.n_tokens = new_n_past.value + + # Get prompt tokens to avoid a cache miss + prompt = llama.input_ids[: llama.n_tokens].tolist() + finally: + self._mtmd_cpp.mtmd_input_chunks_free(chunks) + + finally: + # Cleanup bitmaps + for bitmap in bitmap_cleanup: + self._mtmd_cpp.mtmd_bitmap_free(bitmap) + + # Handle response format and tools (same as before) if response_format is not None and response_format["type"] == "json_object": grammar = _grammar_for_response_format(response_format) @@ -2931,6 +3013,7 @@ def __call__( grammar=grammar, logit_bias=logit_bias, ) + if tool is not None: tool_name = tool["function"]["name"] return _convert_completion_to_chat_function( @@ -2943,12 +3026,10 @@ def _load_image(image_url: str) -> bytes: # TODO: Add Pillow support for other image formats beyond (jpg, png) if image_url.startswith("data:"): import base64 - image_bytes = base64.b64decode(image_url.split(",")[1]) return image_bytes else: import urllib.request - with urllib.request.urlopen(image_url) as f: image_bytes = f.read() return image_bytes @@ -2974,6 +3055,7 @@ def get_image_urls(messages: List[llama_types.ChatCompletionRequestMessage]): @staticmethod def 
split_text_on_image_urls(text: str, image_urls: List[str]): + """This method is no longer used in the new implementation.""" def find_first(s: str, substrs: List[str]): for i, substr in enumerate(substrs): pos = s.find(substr) @@ -3373,6 +3455,61 @@ class MiniCPMv26ChatHandler(Llava15ChatHandler): ) +class Qwen25VLChatHandler(Llava15ChatHandler): + DEFAULT_SYSTEM_MESSAGE = "You are a helpful assistant." + + CHAT_FORMAT = ( + "<|im_start|>system\n" + "You are a helpful assistant.<|im_end|>\n" + "{% for message in messages %}" + "{% if message['role'] == 'user' %}" + "<|im_start|>user\n" + "{% if message['content'] is string %}" + "{{ message['content'] }}" + "{% else %}" + "{% for content in message['content'] %}" + "{% if content['type'] == 'text' %}" + "{{ content['text'] }}" + "{% elif content['type'] == 'image_url' %}" + "{% if content.image_url is string %}" + "{{ content.image_url }}" + "{% else %}" + "{{ content.image_url.url }}" + "{% endif %}" + "{% endif %}" + "{% endfor %}" + "{% endif %}" + "<|im_end|>\n" + "{% endif %}" + "{% endfor %}" + "<|im_start|>assistant\n" + ) + + def __call__(self, **kwargs): + llama = kwargs['llama'] + + # Clear state for multiple runs + llama.reset() + llama._ctx.kv_cache_clear() + llama.n_tokens = 0 + + if hasattr(llama, 'input_ids'): + llama.input_ids.fill(0) + + # Clear any handler state + if hasattr(self, '_last_image_embed'): + self._last_image_embed = None + self._last_image_hash = None + + if self.verbose: + messages = kwargs.get('messages', []) + image_count = len(self.get_image_urls(messages)) + print(f"Minimal - Cleared state, processing {image_count} images", file=sys.stderr) + + # Use parent implementation + return super().__call__(**kwargs) + + @register_chat_completion_handler("chatml-function-calling") def chatml_function_calling( llama: llama.Llama, diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 205d89a0b..d13d60458 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -161,9 +161,13 @@ llama_context_p = NewType("llama_context_p", int) llama_context_p_ctypes = ctypes.c_void_p -# # struct llama_sampler; -# llama_sampler_p = NewType("llama_sampler_p", int) -# llama_sampler_p_ctypes = ctypes.c_void_p +# typedef struct llama_memory_i * llama_memory_t; +llama_memory_t = NewType("llama_memory_t", int) +llama_memory_t_ctypes = ctypes.c_void_p + +# struct llama_kv_cache; (DEPRECATED) +llama_kv_cache_p = NewType("llama_kv_cache_p", int) +llama_kv_cache_p_ctypes = ctypes.c_void_p # typedef int32_t llama_pos; llama_pos = ctypes.c_int32 @@ -227,6 +231,13 @@ # LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26, # LLAMA_VOCAB_PRE_TYPE_MINERVA = 27, # LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28, +# LLAMA_VOCAB_PRE_TYPE_GPT4O = 29, +# LLAMA_VOCAB_PRE_TYPE_SUPERBPE = 30, +# LLAMA_VOCAB_PRE_TYPE_TRILLION = 31, +# LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32, +# LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33, +# LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34, +# LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35, # }; LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0 LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1 @@ -244,7 +255,7 @@ LLAMA_VOCAB_PRE_TYPE_DBRX = 13 LLAMA_VOCAB_PRE_TYPE_SMAUG = 14 LLAMA_VOCAB_PRE_TYPE_PORO = 15 -LLAMA_VOCAV_PRE_TYPE_CHATGLM3 = 16 +LLAMA_VOCAB_PRE_TYPE_CHATGLM3 = 16 LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17 LLAMA_VOCAB_PRE_TYPE_VIKING = 18 LLAMA_VOCAB_PRE_TYPE_JAIS = 19 @@ -257,6 +268,13 @@ LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26 LLAMA_VOCAB_PRE_TYPE_MINERVA = 27 LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28 +LLAMA_VOCAB_PRE_TYPE_GPT4O = 29 +LLAMA_VOCAB_PRE_TYPE_SUPERBPE = 30 
+LLAMA_VOCAB_PRE_TYPE_TRILLION = 31 +LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32 +LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33 +LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34 +LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35 # // note: these values should be synchronized with ggml_rope @@ -405,14 +423,14 @@ # LLAMA_ROPE_SCALING_TYPE_LINEAR = 1, # LLAMA_ROPE_SCALING_TYPE_YARN = 2, # LLAMA_ROPE_SCALING_TYPE_LONGROPE = 3, -# LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_YARN, +# LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_LONGROPE, # }; LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED = -1 LLAMA_ROPE_SCALING_TYPE_NONE = 0 LLAMA_ROPE_SCALING_TYPE_LINEAR = 1 LLAMA_ROPE_SCALING_TYPE_YARN = 2 LLAMA_ROPE_SCALING_TYPE_LONGROPE = 3 -LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_YARN +LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_LONGROPE # enum llama_pooling_type { # LLAMA_POOLING_TYPE_UNSPECIFIED = -1, @@ -442,7 +460,7 @@ # enum llama_split_mode { # LLAMA_SPLIT_MODE_NONE = 0, // single GPU # LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs -# LLAMA_SPLIT_MODE_ROW = 2, // split rows across GPUs +# LLAMA_SPLIT_MODE_ROW = 2, // split layers and KV across GPUs, use tensor parallelism if supported # }; LLAMA_SPLIT_MODE_NONE = 0 LLAMA_SPLIT_MODE_LAYER = 1 @@ -516,18 +534,21 @@ class llama_token_data_array(ctypes.Structure): ) -# // Input data for llama_decode +# // Input data for llama_encode/llama_decode # // A llama_batch object can contain input about one or many sequences # // The provided arrays (i.e. token, embd, pos, etc.) must have size of n_tokens # // # // - token : the token ids of the input (used when embd is NULL) # // - embd : token embeddings (i.e. float vector of size n_embd) (used when token is NULL) # // - pos : the positions of the respective token in the sequence -# // (if set to NULL, the token position will be tracked automatically by llama_decode) +# // (if set to NULL, the token position will be tracked automatically by llama_encode/llama_decode) # // - seq_id : the sequence to which the respective token belongs # // (if set to NULL, the sequence ID will be assumed to be 0) # // - logits : if zero, the logits (and/or the embeddings) for the respective token will not be output -# // (if set to NULL, only the logits for last token will be returned) +# // (if set to NULL: +# // - if embeddings: all tokens are output +# // - if not: only the last token is output +# // ) # // # typedef struct llama_batch { # int32_t n_tokens; @@ -537,10 +558,10 @@ class llama_token_data_array(ctypes.Structure): # llama_pos * pos; # int32_t * n_seq_id; # llama_seq_id ** seq_id; -# int8_t * logits; // TODO: rename this to "output" +# int8_t * logits; // TODO: rename this to "output" # } llama_batch; class llama_batch(ctypes.Structure): - """Input data for llama_decode + """Input data for llama_encode/llama_decode A llama_batch object can contain input about one or many sequences @@ -628,17 +649,23 @@ class llama_model_kv_override(ctypes.Structure): value: Union[int, float, bool, bytes] +# struct llama_model_tensor_buft_override { +# const char * pattern; +# ggml_backend_buffer_type_t buft; +# }; + + # struct llama_model_params { # // NULL-terminated list of devices to use for offloading (if NULL, all available devices are used) # ggml_backend_dev_t * devices; +# // NULL-terminated list of buffer types to use for tensors that match a pattern +# const struct llama_model_tensor_buft_override * tensor_buft_overrides; + # int32_t n_gpu_layers; // number of layers to store in VRAM # enum 
llama_split_mode split_mode; // how to split the model across multiple GPUs -# // main_gpu interpretation depends on split_mode: -# // LLAMA_SPLIT_MODE_NONE: the GPU that is used for the entire model -# // LLAMA_SPLIT_MODE_ROW: the GPU that is used for small tensors and intermediate results -# // LLAMA_SPLIT_MODE_LAYER: ignored +# // the GPU that is used for the entire model when split_mode is LLAMA_SPLIT_MODE_NONE # int32_t main_gpu; # // proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices() @@ -655,7 +682,6 @@ class llama_model_kv_override(ctypes.Structure): # // override key-value pairs of the model meta data # const struct llama_model_kv_override * kv_overrides; - # // Keep the booleans together to avoid misalignment during copy-by-value. # bool vocab_only; // only load the vocabulary, no weights # bool use_mmap; // use mmap if possible @@ -666,9 +692,11 @@ class llama_model_params(ctypes.Structure): """Parameters for llama_model Attributes: + devices (ctypes.Array[ggml_backend_dev_t]): NULL-terminated list of devices to use for offloading (if NULL, all available devices are used) + tensor_buft_overrides (ctypes.Array[llama_model_tensor_buft_override]): NULL-terminated list of buffer types to use for tensors that match a pattern n_gpu_layers (int): number of layers to store in VRAM split_mode (int): how to split the model across multiple GPUs - main_gpu (int): the GPU that is used for the entire model. main_gpu interpretation depends on split_mode: LLAMA_SPLIT_NONE: the GPU that is used for the entire model LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results LLAMA_SPLIT_LAYER: ignored + main_gpu (int): the GPU that is used for the entire model when split_mode is LLAMA_SPLIT_MODE_NONE tensor_split (ctypes.Array[ctypes.ctypes.c_float]): proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices() progress_callback (llama_progress_callback): called with a progress value between 0.0 and 1.0. Pass NULL to disable. If the provided progress_callback returns true, model loading continues. If it returns false, model loading is immediately aborted. 
progress_callback_user_data (ctypes.ctypes.c_void_p): context pointer passed to the progress callback @@ -679,6 +707,8 @@ class llama_model_params(ctypes.Structure): check_tensors (bool): validate model tensor data""" if TYPE_CHECKING: + devices: CtypesArray[ctypes.c_void_p] # NOTE: unused + tensor_buft_overrides: CtypesArray[llama_model_tensor_buft_override] # NOTE: unused n_gpu_layers: int split_mode: int main_gpu: int @@ -693,6 +723,7 @@ class llama_model_params(ctypes.Structure): _fields_ = [ ("devices", ctypes.c_void_p), # NOTE: unnused + ("tensor_buft_overrides", ctypes.c_void_p), # NOTE: unused ("n_gpu_layers", ctypes.c_int32), ("split_mode", ctypes.c_int), ("main_gpu", ctypes.c_int32), @@ -708,7 +739,7 @@ class llama_model_params(ctypes.Structure): # // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations -# // https://github.com/ggerganov/llama.cpp/pull/7544 +# // https://github.com/ggml-org/llama.cpp/pull/7544 # struct llama_context_params { # uint32_t n_ctx; // text context, 0 = from model # uint32_t n_batch; // logical maximum batch size that can be submitted to llama_decode @@ -721,7 +752,7 @@ class llama_model_params(ctypes.Structure): # enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id # enum llama_attention_type attention_type; // attention type to use for embeddings -# // ref: https://github.com/ggerganov/llama.cpp/pull/2054 +# // ref: https://github.com/ggml-org/llama.cpp/pull/2054 # float rope_freq_base; // RoPE base frequency, 0 = from model # float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model # float yarn_ext_factor; // YaRN extrapolation mix factor, negative = from model @@ -729,7 +760,7 @@ class llama_model_params(ctypes.Structure): # float yarn_beta_fast; // YaRN low correction dim # float yarn_beta_slow; // YaRN high correction dim # uint32_t yarn_orig_ctx; // YaRN original context size -# float defrag_thold; // defragment the KV cache if holes/size > thold, < 0 disabled (default) +# float defrag_thold; // defragment the KV cache if holes/size > thold, <= 0 disabled (default) # ggml_backend_sched_eval_callback cb_eval; # void * cb_eval_user_data; @@ -737,19 +768,21 @@ class llama_model_params(ctypes.Structure): # enum ggml_type type_k; // data type for K cache [EXPERIMENTAL] # enum ggml_type type_v; // data type for V cache [EXPERIMENTAL] -# // Keep the booleans together to avoid misalignment during copy-by-value. -# bool logits_all; // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead) -# bool embeddings; // if true, extract embeddings (together with logits) -# bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU -# bool flash_attn; // whether to use flash attention [EXPERIMENTAL] -# bool no_perf; // whether to measure performance timings - - # // Abort callback # // if it returns true, execution of llama_decode() will be aborted # // currently works only with CPU execution # ggml_abort_callback abort_callback; # void * abort_callback_data; + +# // Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value. 
+# bool embeddings; // if true, extract embeddings (together with logits) +# bool offload_kqv; // offload the KQV ops (including the KV cache) to GPU +# bool flash_attn; // use flash attention [EXPERIMENTAL] +# bool no_perf; // measure performance timings +# bool op_offload; // offload host tensor operations to device +# bool swa_full; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055) +# // NOTE: setting to false when n_seq_max > 1 can cause bad performance in some cases +# // ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573 # }; class llama_context_params(ctypes.Structure): """Parameters for llama_context @@ -771,18 +804,19 @@ class llama_context_params(ctypes.Structure): yarn_beta_fast (float): YaRN low correction dim yarn_beta_slow (float): YaRN high correction dim yarn_orig_ctx (int): YaRN original context size - defrag_thold (float): defragment the KV cache if holes/size > thold, < 0 disabled (default) + defrag_thold (float): defragment the KV cache if holes/size > thold, <= 0 disabled (default) cb_eval (ggml_backend_sched_eval_callback): callback for scheduling eval cb_eval_user_data (ctypes.ctypes.c_void_p): user data for cb_eval type_k (int): data type for K cache type_v (int): data type for V cache - logits_all (bool): the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead) + abort_callback (ggml_abort_callback): abort callback if it returns true, execution of llama_decode() will be aborted + abort_callback_data (ctypes.ctypes.c_void_p): data for abort_callback embeddings (bool): if true, extract embeddings (together with logits) offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU flash_attn (bool): whether to use flash attention no_perf (bool): whether to measure performance timings - abort_callback (ggml_abort_callback): abort callback if it returns true, execution of llama_decode() will be aborted - abort_callback_data (ctypes.ctypes.c_void_p): data for abort_callback + op_offload (bool): offload host tensor operations to device + swa_full (bool): use full-size SWA cache """ if TYPE_CHECKING: @@ -807,13 +841,14 @@ class llama_context_params(ctypes.Structure): cb_eval_user_data: ctypes.c_void_p type_k: int type_v: int - logits_all: bool + abort_callback: Callable[[ctypes.c_void_p], bool] + abort_callback_data: ctypes.c_void_p embeddings: bool offload_kqv: bool flash_attn: bool no_perf: bool - abort_callback: Callable[[ctypes.c_void_p], bool] - abort_callback_data: ctypes.c_void_p + op_offload: bool + swa_full: bool _fields_ = [ ("n_ctx", ctypes.c_uint32), @@ -837,13 +872,14 @@ class llama_context_params(ctypes.Structure): ("cb_eval_user_data", ctypes.c_void_p), ("type_k", ctypes.c_int), ("type_v", ctypes.c_int), - ("logits_all", ctypes.c_bool), + ("abort_callback", ggml_abort_callback), + ("abort_callback_data", ctypes.c_void_p), ("embeddings", ctypes.c_bool), ("offload_kqv", ctypes.c_bool), ("flash_attn", ctypes.c_bool), ("no_perf", ctypes.c_bool), - ("abort_callback", ggml_abort_callback), - ("abort_callback_data", ctypes.c_void_p), + ("op_offload", ctypes.c_bool), + ("swa_full", ctypes.c_bool), ] @@ -865,17 +901,19 @@ class llama_context_params(ctypes.Structure): # // model quantization parameters # typedef struct llama_model_quantize_params { -# int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency() -# enum llama_ftype ftype; // quantize to this 
llama_ftype -# enum ggml_type output_tensor_type; // output tensor type -# enum ggml_type token_embedding_type; // token embeddings tensor type -# bool allow_requantize; // allow quantizing non-f32/f16 tensors -# bool quantize_output_tensor; // quantize output.weight -# bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored -# bool pure; // quantize all tensors to the default type -# bool keep_split; // quantize to the same number of shards -# void * imatrix; // pointer to importance matrix data -# void * kv_overrides; // pointer to vector containing overrides +# int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency() +# enum llama_ftype ftype; // quantize to this llama_ftype +# enum ggml_type output_tensor_type; // output tensor type +# enum ggml_type token_embedding_type; // token embeddings tensor type +# bool allow_requantize; // allow quantizing non-f32/f16 tensors +# bool quantize_output_tensor; // quantize output.weight +# bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored +# bool pure; // quantize all tensors to the default type +# bool keep_split; // quantize to the same number of shards +# void * imatrix; // pointer to importance matrix data +# void * kv_overrides; // pointer to vector containing overrides +# void * tensor_types; // pointer to vector containing tensor types +# void * prune_layers; // pointer to vector containing layer indices to prune # } llama_model_quantize_params; class llama_model_quantize_params(ctypes.Structure): """Parameters for llama_model_quantize @@ -892,6 +930,8 @@ class llama_model_quantize_params(ctypes.Structure): keep_split (bool): quantize to the same number of shards imatrix (ctypes.c_void_p): pointer to importance matrix data kv_overrides (ctypes.c_void_p): pointer to vector containing overrides + tensor_types (ctypes.c_void_p): pointer to vector containing tensor types + prune_layers (ctypes.c_void_p): pointer to vector containing layer indices to prune """ if TYPE_CHECKING: @@ -906,6 +946,8 @@ class llama_model_quantize_params(ctypes.Structure): keep_split: bool imatrix: ctypes.c_void_p kv_overrides: ctypes.c_void_p + tensor_types: ctypes.c_void_p + prune_layers: ctypes.c_void_p _fields_ = [ ("nthread", ctypes.c_int32), @@ -919,6 +961,8 @@ class llama_model_quantize_params(ctypes.Structure): ("keep_split", ctypes.c_bool), ("imatrix", ctypes.c_void_p), ("kv_overrides", ctypes.c_void_p), + ("tensor_types", ctypes.c_void_p), + ("prune_layers", ctypes.c_void_p), ] @@ -1029,7 +1073,6 @@ def llama_model_quantize_default_params() -> llama_model_quantize_params: # // Initialize the llama + ggml backend # // If numa is true, use NUMA optimizations # // Call once at the start of the program -# LLAMA_API void llama_backend_init(bool numa); # LLAMA_API void llama_backend_init(void); @ctypes_function( "llama_backend_init", @@ -1038,7 +1081,6 @@ def llama_model_quantize_default_params() -> llama_model_quantize_params: ) def llama_backend_init(): """Initialize the llama + ggml backend - If numa is true, use NUMA optimizations Call once at the start of the program""" ... @@ -1152,7 +1194,21 @@ def llama_model_load_from_splits( ... 
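The reworked llama_model_params / llama_context_params layouts above are set by field name from Python, so the reordering mainly matters when constructing them. A minimal sketch (assuming the usual llama_model_default_params / llama_context_default_params helpers from these bindings) that touches a few of the fields discussed here, including the new trailing booleans:

import llama_cpp

def make_params(n_gpu_layers: int = 0, n_ctx: int = 4096):
    # Model parameters: only a couple of fields are touched for illustration.
    mparams = llama_cpp.llama_model_default_params()
    mparams.n_gpu_layers = n_gpu_layers
    mparams.use_mmap = True

    # Context parameters: the boolean flags now sit at the end of the struct,
    # after abort_callback / abort_callback_data, and logits_all is gone.
    cparams = llama_cpp.llama_context_default_params()
    cparams.n_ctx = n_ctx
    cparams.defrag_thold = -1.0  # <= 0 disables KV cache defragmentation
    cparams.embeddings = False
    cparams.op_offload = True    # new: offload host tensor ops to the device
    cparams.swa_full = True      # new: use the full-size SWA cache
    return mparams, cparams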
-# LLAMA_API void llama_free_model(struct llama_model * model); +# LLAMA_API void llama_model_save_to_file( +# const struct llama_model * model, +# const char * path_model); +@ctypes_function( + "llama_model_save_to_file", + [llama_model_p_ctypes, ctypes.c_char_p], + None, +) +def llama_model_save_to_file(model: llama_model_p, path_model: bytes, /): + """Save the model to a file""" + ... + + +# DEPRECATED(LLAMA_API void llama_free_model(struct llama_model * model), +# "use llama_model_free instead"); @ctypes_function( "llama_free_model", [llama_model_p_ctypes], @@ -1229,6 +1285,12 @@ def llama_max_devices() -> int: ... +# LLAMA_API size_t llama_max_parallel_sequences(void); +@ctypes_function("llama_max_parallel_sequences", [], ctypes.c_size_t) +def llama_max_parallel_sequences() -> int: + ... + + # LLAMA_API bool llama_supports_mmap (void); @ctypes_function("llama_supports_mmap", [], ctypes.c_bool) def llama_supports_mmap() -> bool: @@ -1277,8 +1339,6 @@ def llama_n_seq_max(ctx: llama_context_p, /) -> int: ... - - # DEPRECATED(LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model), "use llama_model_n_ctx_train instead"); @ctypes_function("llama_n_ctx_train", [llama_model_p_ctypes], ctypes.c_int32) def llama_n_ctx_train(model: llama_model_p, /) -> int: @@ -1315,12 +1375,30 @@ def llama_get_model(ctx: llama_context_p, /) -> Optional[llama_model_p]: ... -# LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx); +# LLAMA_API llama_memory_t llama_get_memory (const struct llama_context * ctx); +@ctypes_function("llama_get_memory", [llama_context_p_ctypes], llama_memory_t_ctypes) +def llama_get_memory(ctx: llama_context_p, /) -> Optional[llama_memory_t]: + """Get the memory for the context""" + ... + + +# LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx); @ctypes_function("llama_pooling_type", [llama_context_p_ctypes], ctypes.c_int) def llama_pooling_type(ctx: llama_context_p, /) -> int: ... +# DEPRECATED(LLAMA_API struct llama_kv_cache * llama_get_kv_self(struct llama_context * ctx), "use llama_get_memory instead"); +@ctypes_function( + "llama_get_kv_self", + [llama_context_p_ctypes], + llama_kv_cache_p_ctypes, +) +def llama_get_kv_self(ctx: llama_context_p, /) -> Optional[llama_kv_cache_p]: + """Get the KV cache for self-attention (DEPRECATED)""" + ... + + # LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model); @ctypes_function("llama_model_get_vocab", [llama_model_p_ctypes], llama_vocab_p_ctypes) def llama_model_get_vocab(model: llama_model_p, /) -> Optional[llama_vocab_p]: @@ -1357,6 +1435,18 @@ def llama_model_n_head(model: llama_model_p, /) -> int: ... +# LLAMA_API int32_t llama_model_n_head_kv (const struct llama_model * model); +@ctypes_function("llama_model_n_head_kv", [llama_model_p_ctypes], ctypes.c_int32) +def llama_model_n_head_kv(model: llama_model_p, /) -> int: + ... + + +# LLAMA_API int32_t llama_model_n_swa (const struct llama_model * model); +@ctypes_function("llama_model_n_swa", [llama_model_p_ctypes], ctypes.c_int32) +def llama_model_n_swa(model: llama_model_p, /) -> int: + ... + + # // Get the model's RoPE frequency scaling factor # LLAMA_API float llama_model_rope_freq_scale_train(const struct llama_model * model); @ctypes_function("llama_model_rope_freq_scale_train", [llama_model_p_ctypes], ctypes.c_float) @@ -1364,9 +1454,26 @@ def llama_model_rope_freq_scale_train(model: llama_model_p, /) -> float: ... 
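As a usage sketch for the llama_model_* getters bound above (the model handle is assumed to come from llama_model_load_from_file or llama_model_load_from_splits):

import llama_cpp

def describe_model(model: llama_cpp.llama_model_p) -> dict:
    # Per-model properties via the new-style getters; the last entry is a
    # global property of the build, not of the model.
    return {
        "n_head_kv": llama_cpp.llama_model_n_head_kv(model),
        "n_swa": llama_cpp.llama_model_n_swa(model),
        "rope_freq_scale_train": llama_cpp.llama_model_rope_freq_scale_train(model),
        "max_parallel_sequences": llama_cpp.llama_max_parallel_sequences(),
    }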
+# // Returns the number of classifier outputs (only valid for classifier models) +# // Undefined behavior for non-classifier models +# LLAMA_API uint32_t llama_model_n_cls_out(const struct llama_model * model); +@ctypes_function("llama_model_n_cls_out", [llama_model_p_ctypes], ctypes.c_uint32) +def llama_model_n_cls_out(model: llama_model_p, /) -> int: + """Returns the number of classifier outputs (only valid for classifier models)""" + ... + + +# // Returns label of classifier output by index (<n_cls_out). Returns nullptr if no label provided +# LLAMA_API const char * llama_model_cls_label(const struct llama_model * model, uint32_t i); +@ctypes_function("llama_model_cls_label", [llama_model_p_ctypes, ctypes.c_uint32], ctypes.c_char_p) +def llama_model_cls_label(model: llama_model_p, i: int, /) -> Optional[bytes]: + """Returns label of classifier output by index. Returns None if no label provided""" + ... + + # LLAMA_API enum llama_vocab_type llama_vocab_type (const struct llama_model * model); -@ctypes_function("llama_vocab_type", [llama_model_p_ctypes], ctypes.c_int) -def llama_vocab_type(model: llama_model_p, /) -> int: +@ctypes_function("llama_vocab_type", [llama_vocab_p_ctypes], ctypes.c_int) +def llama_vocab_type(vocab: llama_vocab_p, /) -> int: ... @@ -1564,6 +1671,10 @@ def llama_model_quantize( ... +# // +# // Adapters +# // + # // Load a LoRA adapter from file # LLAMA_API struct llama_adapter_lora * llama_adapter_lora_init( # struct llama_model * model, # const char * path_lora); @@ -1687,148 +1798,256 @@ def llama_apply_adapter_cvec( # // -# // KV cache +# // Memory # // +# // Clear the memory contents +# // If data == true, the data buffers will also be cleared together with the metadata +# LLAMA_API void llama_memory_clear( +# llama_memory_t mem, +# bool data); +@ctypes_function( + "llama_memory_clear", + [llama_memory_t_ctypes, ctypes.c_bool], + None, +) +def llama_memory_clear(mem: llama_memory_t, data: bool, /): + """Clear the memory contents + If data == true, the data buffers will also be cleared together with the metadata""" + ... -# // Information associated with an individual cell in the KV cache view. -# struct llama_kv_cache_view_cell { -# // The position for this cell. Takes KV cache shifts into account. -# // May be negative if the cell is not populated. -# llama_pos pos; -# }; -class llama_kv_cache_view_cell(ctypes.Structure): - """Information associated with an individual cell in the KV cache view. - - Attributes: - pos (llama_pos): The position for this cell. Takes KV cache shifts into account. - May be negative if the cell is not populated.""" - - if TYPE_CHECKING: - pos: llama_pos - - _fields_ = [("pos", llama_pos)] - - -# // An updateable view of the KV cache. -# struct llama_kv_cache_view { -# // Number of KV cache cells. This will be the same as the context size. -# int32_t n_cells; -# // Maximum number of sequences that can exist in a cell. It's not an error -# // if there are more sequences in a cell than this value, however they will -# // not be visible in the view cells_sequences. -# int32_t n_seq_max; +# // Removes all tokens that belong to the specified sequence and have positions in [p0, p1) +# // Returns false if a partial sequence cannot be removed.
Removing a whole sequence never fails +# // seq_id < 0 : match any sequence +# // p0 < 0 : [0, p1] +# // p1 < 0 : [p0, inf) +# LLAMA_API bool llama_memory_seq_rm( +# llama_memory_t mem, +# llama_seq_id seq_id, +# llama_pos p0, +# llama_pos p1); +@ctypes_function( + "llama_memory_seq_rm", + [ + llama_memory_t_ctypes, + llama_seq_id, + llama_pos, + llama_pos, + ], + ctypes.c_bool, +) +def llama_memory_seq_rm( + mem: llama_memory_t, + seq_id: Union[llama_seq_id, int], + p0: Union[llama_pos, int], + p1: Union[llama_pos, int], + /, +) -> bool: + """Removes all tokens that belong to the specified sequence and have positions in [p0, p1) -# // Number of tokens in the cache. For example, if there are two populated -# // cells, the first with 1 sequence id in it and the second with 2 sequence -# // ids then you'll have 3 tokens. -# int32_t token_count; + Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails -# // Number of populated cache cells. -# int32_t used_cells; + seq_id < 0 : match any sequence + p0 < 0 : [0, p1] + p1 < 0 : [p0, inf)""" + ... -# // Maximum contiguous empty slots in the cache. -# int32_t max_contiguous; -# // Index to the start of the max_contiguous slot range. Can be negative -# // when cache is full. -# int32_t max_contiguous_idx; +# // Copy all tokens that belong to the specified sequence to another sequence +# // p0 < 0 : [0, p1] +# // p1 < 0 : [p0, inf) +# LLAMA_API void llama_memory_seq_cp( +# llama_memory_t mem, +# llama_seq_id seq_id_src, +# llama_seq_id seq_id_dst, +# llama_pos p0, +# llama_pos p1); +@ctypes_function( + "llama_memory_seq_cp", + [ + llama_memory_t_ctypes, + llama_seq_id, + llama_seq_id, + llama_pos, + llama_pos, + ], + None, +) +def llama_memory_seq_cp( + mem: llama_memory_t, + seq_id_src: Union[llama_seq_id, int], + seq_id_dst: Union[llama_seq_id, int], + p0: Union[llama_pos, int], + p1: Union[llama_pos, int], + /, +): + """Copy all tokens that belong to the specified sequence to another sequence + p0 < 0 : [0, p1] + p1 < 0 : [p0, inf)""" + ... -# // Information for an individual cell. -# struct llama_kv_cache_view_cell * cells; +# // Removes all tokens that do not belong to the specified sequence +# LLAMA_API void llama_memory_seq_keep( +# llama_memory_t mem, +# llama_seq_id seq_id); +@ctypes_function( + "llama_memory_seq_keep", [llama_memory_t_ctypes, llama_seq_id], None +) +def llama_memory_seq_keep(mem: llama_memory_t, seq_id: Union[llama_seq_id, int], /): + """Removes all tokens that do not belong to the specified sequence""" + ... -# // The sequences for each cell. There will be n_seq_max items per cell. 
-# llama_seq_id * cells_sequences; -# }; -class llama_kv_cache_view(ctypes.Structure): - if TYPE_CHECKING: - n_cells: int - n_max_seq: int - token_count: int - used_cells: int - max_contiguous: int - max_contiguous_idx: int - cells: CtypesArray[llama_kv_cache_view_cell] - cells_sequences: CtypesArray[llama_seq_id] - _fields_ = [ - ("n_cells", ctypes.c_int32), - ("n_max_seq", ctypes.c_int32), - ("token_count", ctypes.c_int32), - ("used_cells", ctypes.c_int32), - ("max_contiguous", ctypes.c_int32), - ("max_contiguous_idx", ctypes.c_int32), - ("cells", ctypes.POINTER(llama_kv_cache_view_cell)), - ("cells_sequences", ctypes.POINTER(llama_seq_id)), - ] +# // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1) +# // p0 < 0 : [0, p1] +# // p1 < 0 : [p0, inf) +# LLAMA_API void llama_memory_seq_add( +# llama_memory_t mem, +# llama_seq_id seq_id, +# llama_pos p0, +# llama_pos p1, +# llama_pos delta); +@ctypes_function( + "llama_memory_seq_add", + [ + llama_memory_t_ctypes, + llama_seq_id, + llama_pos, + llama_pos, + llama_pos, + ], + None, +) +def llama_memory_seq_add( + mem: llama_memory_t, + seq_id: Union[llama_seq_id, int], + p0: Union[llama_pos, int], + p1: Union[llama_pos, int], + delta: Union[llama_pos, int], + /, +): + """Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1) + p0 < 0 : [0, p1] + p1 < 0 : [p0, inf)""" + ... -llama_kv_cache_view_p = ctypes.POINTER(llama_kv_cache_view) +# // Integer division of the positions by factor of `d > 1` +# // p0 < 0 : [0, p1] +# // p1 < 0 : [p0, inf) +# LLAMA_API void llama_memory_seq_div( +# llama_memory_t mem, +# llama_seq_id seq_id, +# llama_pos p0, +# llama_pos p1, +# int d); +@ctypes_function( + "llama_memory_seq_div", + [ + llama_memory_t_ctypes, + llama_seq_id, + llama_pos, + llama_pos, + ctypes.c_int, + ], + None, +) +def llama_memory_seq_div( + mem: llama_memory_t, + seq_id: Union[llama_seq_id, int], + p0: Union[llama_pos, int], + p1: Union[llama_pos, int], + d: Union[ctypes.c_int, int], + /, +): + """Integer division of the positions by factor of `d > 1` + p0 < 0 : [0, p1] + p1 < 0 : [p0, inf)""" + ... -# // Create an empty KV cache view. (use only for debugging purposes) -# LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_seq_max); +# // Returns the smallest position present in the memory for the specified sequence +# // This is typically non-zero only for SWA caches +# // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the memory +# // Return -1 if the sequence is empty +# LLAMA_API llama_pos llama_memory_seq_pos_min( +# llama_memory_t mem, +# llama_seq_id seq_id); @ctypes_function( - "llama_kv_cache_view_init", - [llama_context_p_ctypes, ctypes.c_int32], - llama_kv_cache_view, + "llama_memory_seq_pos_min", [llama_memory_t_ctypes, llama_seq_id], llama_pos ) -def llama_kv_cache_view_init( - ctx: llama_context_p, n_seq_max: Union[ctypes.c_int32, int], / -) -> llama_kv_cache_view: - """Create an empty KV cache view. (use only for debugging purposes)""" +def llama_memory_seq_pos_min( + mem: llama_memory_t, seq_id: Union[llama_seq_id, int], / +) -> int: + """Returns the smallest position present in the memory for the specified sequence + This is typically non-zero only for SWA caches + Return -1 if the sequence is empty""" ... -# // Free a KV cache view. 
(use only for debugging purposes) -# LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view); -@ctypes_function("llama_kv_cache_view_free", [llama_kv_cache_view_p], None) -def llama_kv_cache_view_free(view: "ctypes.pointer[llama_kv_cache_view]", /): # type: ignore - """Free a KV cache view. (use only for debugging purposes)""" +# // Returns the largest position present in the memory for the specified sequence +# // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the memory +# // Return -1 if the sequence is empty +# LLAMA_API llama_pos llama_memory_seq_pos_max( +# llama_memory_t mem, +# llama_seq_id seq_id); +@ctypes_function( + "llama_memory_seq_pos_max", [llama_memory_t_ctypes, llama_seq_id], llama_pos +) +def llama_memory_seq_pos_max( + mem: llama_memory_t, seq_id: Union[llama_seq_id, int], / +) -> int: + """Returns the largest position present in the memory for the specified sequence + Return -1 if the sequence is empty""" ... -# // Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes) -# LLAMA_API void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view); -@ctypes_function( - "llama_kv_cache_view_update", [llama_context_p_ctypes, llama_kv_cache_view_p], None -) -def llama_kv_cache_view_update(ctx: llama_context_p, view: CtypesPointerOrRef[llama_kv_cache_view], /): # type: ignore - """Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)""" +# // Check if the memory supports shifting +# LLAMA_API bool llama_memory_can_shift(llama_memory_t mem); +@ctypes_function("llama_memory_can_shift", [llama_memory_t_ctypes], ctypes.c_bool) +def llama_memory_can_shift(mem: llama_memory_t, /) -> bool: + """Check if the memory supports shifting""" ... +# // +# // KV cache for self-attention (TODO: deprecate in favor of llama_memory) +# // + # // Returns the number of tokens in the KV cache (slow, use only for debug) # // If a KV cell has multiple sequences assigned to it, it will be counted multiple times -# LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx); +# DEPRECATED(LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx), +# "Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)"); @ctypes_function( - "llama_get_kv_cache_token_count", [llama_context_p_ctypes], ctypes.c_int32 + "llama_kv_self_n_tokens", [llama_context_p_ctypes], ctypes.c_int32 ) -def llama_get_kv_cache_token_count(ctx: llama_context_p, /) -> int: - """Returns the number of tokens in the KV cache (slow, use only for debug) - If a KV cell has multiple sequences assigned to it, it will be counted multiple times - """ +def llama_kv_self_n_tokens(ctx: llama_context_p, /) -> int: + """Returns the number of tokens in the KV cache (slow, use only for debug) (DEPRECATED)""" ... # // Returns the number of used KV cells (i.e. 
have at least one sequence assigned to them) -# LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx); +# DEPRECATED(LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx), +# "Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)"); @ctypes_function( - "llama_get_kv_cache_used_cells", [llama_context_p_ctypes], ctypes.c_int32 + "llama_kv_self_used_cells", [llama_context_p_ctypes], ctypes.c_int32 ) -def llama_get_kv_cache_used_cells(ctx: llama_context_p, /) -> int: - """Returns the number of used KV cells (i.e. have at least one sequence assigned to them)""" +def llama_kv_self_used_cells(ctx: llama_context_p, /) -> int: + """Returns the number of used KV cells (DEPRECATED)""" ... # // Clear the KV cache - both cell info is erased and KV data is zeroed -# LLAMA_API void llama_kv_cache_clear( -# struct llama_context * ctx); -@ctypes_function("llama_kv_cache_clear", [llama_context_p_ctypes], None) -def llama_kv_cache_clear(ctx: llama_context_p, /): - """Clear the KV cache""" +# DEPRECATED(LLAMA_API void llama_kv_self_clear( +# struct llama_context * ctx), +# "Use llama_memory_clear() instead"); +@ctypes_function( + "llama_kv_self_clear", [llama_context_p_ctypes], None +) +def llama_kv_self_clear(ctx: llama_context_p, /): + """Clear the KV cache (DEPRECATED)""" ... @@ -1837,13 +2056,14 @@ def llama_kv_cache_clear(ctx: llama_context_p, /): # // seq_id < 0 : match any sequence # // p0 < 0 : [0, p1] # // p1 < 0 : [p0, inf) -# LLAMA_API bool llama_kv_cache_seq_rm( +# DEPRECATED(LLAMA_API bool llama_kv_self_seq_rm( # struct llama_context * ctx, # llama_seq_id seq_id, # llama_pos p0, -# llama_pos p1); +# llama_pos p1), +# "Use llama_memory_seq_rm() instead"); @ctypes_function( - "llama_kv_cache_seq_rm", + "llama_kv_self_seq_rm", [ llama_context_p_ctypes, llama_seq_id, @@ -1852,20 +2072,14 @@ def llama_kv_cache_clear(ctx: llama_context_p, /): ], ctypes.c_bool, ) -def llama_kv_cache_seq_rm( +def llama_kv_self_seq_rm( ctx: llama_context_p, seq_id: Union[llama_seq_id, int], p0: Union[llama_pos, int], p1: Union[llama_pos, int], /, ) -> bool: - """Removes all tokens that belong to the specified sequence and have positions in [p0, p1) - - Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails - - seq_id < 0 : match any sequence - p0 < 0 : [0, p1] - p1 < 0 : [p0, inf)""" + """Remove tokens from KV cache (DEPRECATED)""" ... 
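Taken together, llama_get_memory() plus the llama_memory_* functions above replace the per-context KV cache calls that are deprecated from here on. A minimal migration sketch, assuming ctx is an existing llama_context_p:

import llama_cpp

def reset_sequence(ctx: llama_cpp.llama_context_p, seq_id: int = 0) -> None:
    # Fetch the memory handle once, then operate on it instead of calling
    # the deprecated llama_kv_self_* / llama_kv_cache_* wrappers.
    mem = llama_cpp.llama_get_memory(ctx)
    # p0 < 0 and p1 < 0 cover [0, inf), i.e. the whole sequence.
    llama_cpp.llama_memory_seq_rm(mem, seq_id, -1, -1)

def clear_all(ctx: llama_cpp.llama_context_p) -> None:
    mem = llama_cpp.llama_get_memory(ctx)
    llama_cpp.llama_memory_clear(mem, True)  # True also zeroes the data buffers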
@@ -1873,14 +2087,15 @@ def llama_kv_cache_seq_rm( # // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence # // p0 < 0 : [0, p1] # // p1 < 0 : [p0, inf) -# LLAMA_API void llama_kv_cache_seq_cp( +# DEPRECATED(LLAMA_API void llama_kv_self_seq_cp( # struct llama_context * ctx, # llama_seq_id seq_id_src, # llama_seq_id seq_id_dst, # llama_pos p0, -# llama_pos p1); +# llama_pos p1), +# "Use llama_memory_seq_cp() instead"); @ctypes_function( - "llama_kv_cache_seq_cp", + "llama_kv_self_seq_cp", [ llama_context_p_ctypes, llama_seq_id, @@ -1890,7 +2105,7 @@ def llama_kv_cache_seq_rm( ], None, ) -def llama_kv_cache_seq_cp( +def llama_kv_self_seq_cp( ctx: llama_context_p, seq_id_src: Union[llama_seq_id, int], seq_id_dst: Union[llama_seq_id, int], @@ -1898,39 +2113,37 @@ def llama_kv_cache_seq_cp( p1: Union[llama_pos, int], /, ): - """Copy all tokens that belong to the specified sequence to another sequence - Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence - p0 < 0 : [0, p1] - p1 < 0 : [p0, inf)""" + """Copy tokens in KV cache (DEPRECATED)""" ... # // Removes all tokens that do not belong to the specified sequence -# LLAMA_API void llama_kv_cache_seq_keep( +# DEPRECATED(LLAMA_API void llama_kv_self_seq_keep( # struct llama_context * ctx, -# llama_seq_id seq_id); +# llama_seq_id seq_id), +# "Use llama_memory_seq_keep() instead"); @ctypes_function( - "llama_kv_cache_seq_keep", [llama_context_p_ctypes, llama_seq_id], None + "llama_kv_self_seq_keep", [llama_context_p_ctypes, llama_seq_id], None ) -def llama_kv_cache_seq_keep(ctx: llama_context_p, seq_id: Union[llama_seq_id, int], /): - """Removes all tokens that do not belong to the specified sequence""" +def llama_kv_self_seq_keep(ctx: llama_context_p, seq_id: Union[llama_seq_id, int], /): + """Keep only specified sequence in KV cache (DEPRECATED)""" ... # // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1) # // If the KV cache is RoPEd, the KV data is updated accordingly: # // - lazily on next llama_decode() -# // - explicitly with llama_kv_cache_update() # // p0 < 0 : [0, p1] # // p1 < 0 : [p0, inf) -# LLAMA_API void llama_kv_cache_seq_add( +# DEPRECATED(LLAMA_API void llama_kv_self_seq_add( # struct llama_context * ctx, # llama_seq_id seq_id, # llama_pos p0, # llama_pos p1, -# llama_pos delta); +# llama_pos delta), +# "Use llama_memory_seq_add() instead"); @ctypes_function( - "llama_kv_cache_seq_add", + "llama_kv_self_seq_add", [ llama_context_p_ctypes, llama_seq_id, @@ -1940,7 +2153,7 @@ def llama_kv_cache_seq_keep(ctx: llama_context_p, seq_id: Union[llama_seq_id, in ], None, ) -def llama_kv_cache_seq_add( +def llama_kv_self_seq_add( ctx: llama_context_p, seq_id: Union[llama_seq_id, int], p0: Union[llama_pos, int], @@ -1948,27 +2161,24 @@ def llama_kv_cache_seq_add( delta: Union[llama_pos, int], /, ): - """Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1) - If the KV cache is RoPEd, the KV data is updated accordingly: - - lazily on next llama_decode() - - explicitly with llama_kv_cache_update() - p0 < 0 : [0, p1] - p1 < 0 : [p0, inf)""" + """Add delta to sequence positions in KV cache (DEPRECATED)""" ... 
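One common pattern built from these primitives is the context shift: drop the oldest tokens of a sequence and slide the remaining positions back, using llama_memory_seq_rm() and llama_memory_seq_add() instead of the deprecated wrappers. A sketch under the assumption that n_keep and n_discard are chosen by the caller:

import llama_cpp

def shift_context(ctx: llama_cpp.llama_context_p, seq_id: int,
                  n_keep: int, n_discard: int) -> bool:
    mem = llama_cpp.llama_get_memory(ctx)
    if not llama_cpp.llama_memory_can_shift(mem):
        return False
    # Remove positions [n_keep, n_keep + n_discard) ...
    llama_cpp.llama_memory_seq_rm(mem, seq_id, n_keep, n_keep + n_discard)
    # ... then shift everything after the hole back by n_discard.
    llama_cpp.llama_memory_seq_add(mem, seq_id, n_keep + n_discard, -1, -n_discard)
    return True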
# // Integer division of the positions by factor of `d > 1` -# // If the KV cache is RoPEd, the KV data is updated accordingly +# // If the KV cache is RoPEd, the KV data is updated accordingly: +# // - lazily on next llama_decode() # // p0 < 0 : [0, p1] # // p1 < 0 : [p0, inf) -# LLAMA_API void llama_kv_cache_seq_div( +# DEPRECATED(void llama_kv_self_seq_div( # struct llama_context * ctx, # llama_seq_id seq_id, # llama_pos p0, # llama_pos p1, -# int d); +# int d), +# "Use llama_memory_seq_div() instead"); @ctypes_function( - "llama_kv_cache_seq_div", + "llama_kv_self_seq_div", [ llama_context_p_ctypes, llama_seq_id, @@ -1978,7 +2188,7 @@ def llama_kv_cache_seq_add( ], None, ) -def llama_kv_cache_seq_div( +def llama_kv_self_seq_div( ctx: llama_context_p, seq_id: Union[llama_seq_id, int], p0: Union[llama_pos, int], @@ -1986,40 +2196,71 @@ def llama_kv_cache_seq_div( d: Union[ctypes.c_int, int], /, ): - """Integer division of the positions by factor of `d > 1` - If the KV cache is RoPEd, the KV data is updated accordingly - p0 < 0 : [0, p1] - p1 < 0 : [p0, inf)""" + """Divide sequence positions in KV cache (DEPRECATED)""" + ... + + +# // Returns the smallest position present in the KV cache for the specified sequence +# // This is typically non-zero only for SWA caches +# // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache +# // Return -1 if the sequence is empty +# DEPRECATED(LLAMA_API llama_pos llama_kv_self_seq_pos_min( +# struct llama_context * ctx, +# llama_seq_id seq_id), +# "Use llama_memory_seq_pos_min() instead"); +@ctypes_function( + "llama_kv_self_seq_pos_min", [llama_context_p_ctypes, llama_seq_id], llama_pos +) +def llama_kv_self_seq_pos_min( + ctx: llama_context_p, seq_id: Union[llama_seq_id, int], / +) -> int: + """Returns the smallest position in KV cache for sequence (DEPRECATED)""" + ... + + +# // Returns the largest position present in the KV cache for the specified sequence +# // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache +# // Return -1 if the sequence is empty +# DEPRECATED(LLAMA_API llama_pos llama_kv_self_seq_pos_max( +# struct llama_context * ctx, +# llama_seq_id seq_id), +# "Use llama_memory_seq_pos_max() instead"); +@ctypes_function( + "llama_kv_self_seq_pos_max", [llama_context_p_ctypes, llama_seq_id], llama_pos +) +def llama_kv_self_seq_pos_max( + ctx: llama_context_p, seq_id: Union[llama_seq_id, int], / +) -> int: + """Returns the largest position in KV cache for sequence (DEPRECATED)""" ... # // Defragment the KV cache # // This will be applied: # // - lazily on next llama_decode() -# // - explicitly with llama_kv_cache_update() -# LLAMA_API void llama_kv_cache_defrag(struct llama_context * ctx); -@ctypes_function("llama_kv_cache_defrag", [llama_context_p_ctypes], None) -def llama_kv_cache_defrag(ctx: llama_context_p, /): - """Defragment the KV cache - This will be applied: - - lazily on next llama_decode() - - explicitly with llama_kv_cache_update()""" +# DEPRECATED(LLAMA_API void llama_kv_self_defrag(struct llama_context * ctx), +# "simply remove this call, the context will automatically decide when to do a defragmentation based on 'defrag_thold'"); +@ctypes_function("llama_kv_self_defrag", [llama_context_p_ctypes], None) +def llama_kv_self_defrag(ctx: llama_context_p, /): + """Defragment the KV cache (DEPRECATED)""" ... -# // Apply the KV cache updates (such as K-shifts, defragmentation, etc.) 
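The deprecation messages above point at the pos_min/pos_max queries as the replacement for the old token and cell counters; for a single sequence the number of cached positions can be derived like this (sketch):

import llama_cpp

def cached_positions(ctx: llama_cpp.llama_context_p, seq_id: int = 0) -> int:
    mem = llama_cpp.llama_get_memory(ctx)
    pos_min = llama_cpp.llama_memory_seq_pos_min(mem, seq_id)
    pos_max = llama_cpp.llama_memory_seq_pos_max(mem, seq_id)
    if pos_min < 0 or pos_max < 0:
        return 0  # empty sequence
    return pos_max - pos_min + 1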
-# LLAMA_API void llama_kv_cache_update(struct llama_context * ctx); -@ctypes_function("llama_kv_cache_update", [llama_context_p_ctypes], None) -def llama_kv_cache_update(ctx: llama_context_p, /): - """Apply the KV cache updates (such as K-shifts, defragmentation, etc.)""" +# // Check if the context supports KV cache shifting +# DEPRECATED(LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx), +# "use llama_memory_can_shift() instead"); +@ctypes_function("llama_kv_self_can_shift", [llama_context_p_ctypes], ctypes.c_bool) +def llama_kv_self_can_shift(ctx: llama_context_p, /) -> bool: + """Check if the context supports KV cache shifting (DEPRECATED)""" ... -# // Check if the context supports KV cache shifting -# LLAMA_API bool llama_kv_cache_can_shift(struct llama_context * ctx); -@ctypes_function("llama_kv_cache_can_shift", [llama_context_p_ctypes], ctypes.c_bool) -def llama_kv_cache_can_shift(ctx: llama_context_p, /) -> bool: - """Check if the context supports KV cache shifting""" +# // Apply the KV cache updates (such as K-shifts, defragmentation, etc.) +# DEPRECATED(LLAMA_API void llama_kv_self_update(struct llama_context * ctx), +# "simply remove this call, updates are applied lazily on the next llama_decode()"); +@ctypes_function("llama_kv_self_update", [llama_context_p_ctypes], None) +def llama_kv_self_update(ctx: llama_context_p, /): + """Apply the KV cache updates (DEPRECATED)""" ... @@ -2027,14 +2268,13 @@ def llama_kv_cache_can_shift(ctx: llama_context_p, /) -> bool: # // State / sessions # // - # // Returns the *actual* size in bytes of the state -# // (logits, embedding and kv_cache) +# // (logits, embedding and memory) # // Only use when saving the state, not when restoring it, otherwise the size may be too small. # LLAMA_API size_t llama_state_get_size(struct llama_context * ctx); @ctypes_function("llama_state_get_size", [llama_context_p_ctypes], ctypes.c_size_t) def llama_state_get_size(ctx: llama_context_p, /) -> int: - """Returns the *actual* size in bytes of the state (rng, logits, embedding and kv_cache) - will often be smaller after compacting tokens""" + """Returns the *actual* size in bytes of the state (logits, embedding and memory)""" ... @@ -2042,8 +2282,7 @@ def llama_state_get_size(ctx: llama_context_p, /) -> int: # "use llama_state_get_size instead"); @ctypes_function("llama_get_state_size", [llama_context_p_ctypes], ctypes.c_size_t) def llama_get_state_size(ctx: llama_context_p, /) -> int: - """Returns the maximum size in bytes of the state (rng, logits, embedding - and kv_cache) - will often be smaller after compacting tokens""" + """Returns the size in bytes of the state (DEPRECATED)""" ... @@ -2090,9 +2329,7 @@ def llama_state_get_data( def llama_copy_state_data( ctx: llama_context_p, dst: CtypesArray[ctypes.c_uint8], / ) -> int: - """Copies the state to the specified destination address. - Destination needs to have allocated enough memory. - Returns the number of bytes copied""" + """Copies the state to the specified destination address (DEPRECATED)""" ... @@ -2130,7 +2367,7 @@ def llama_state_set_data( def llama_set_state_data( ctx: llama_context_p, src: CtypesArray[ctypes.c_uint8], / ) -> int: - """Set the state reading from the specified address""" + """Set the state reading from the specified address (DEPRECATED)""" ... 
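For the state API above, the non-deprecated entry points are llama_state_get_size / llama_state_get_data / llama_state_set_data. A hedged sketch of saving and restoring a context, assuming the data calls take (ctx, buffer, size) as in recent llama.cpp:

import ctypes
import llama_cpp

def snapshot_state(ctx: llama_cpp.llama_context_p) -> bytes:
    # Query the exact size first, then copy logits/embeddings/memory out.
    n = llama_cpp.llama_state_get_size(ctx)
    buf = (ctypes.c_uint8 * n)()
    written = llama_cpp.llama_state_get_data(ctx, buf, n)
    return bytes(buf[:written])

def restore_state(ctx: llama_cpp.llama_context_p, blob: bytes) -> None:
    buf = (ctypes.c_uint8 * len(blob)).from_buffer_copy(blob)
    llama_cpp.llama_state_set_data(ctx, buf, len(blob))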
@@ -2179,7 +2416,7 @@ def llama_state_load_file( ctypes.c_size_t, ctypes.POINTER(ctypes.c_size_t), ], - ctypes.c_size_t, + ctypes.c_bool, ) def llama_load_session_file( ctx: llama_context_p, @@ -2188,7 +2425,7 @@ def llama_load_session_file( n_token_capacity: Union[ctypes.c_size_t, int], n_token_count_out: CtypesPointerOrRef[ctypes.c_size_t], /, -) -> int: +) -> bool: ... @@ -2231,7 +2468,7 @@ def llama_state_save_file( llama_token_p, ctypes.c_size_t, ], - ctypes.c_size_t, + ctypes.c_bool, ) def llama_save_session_file( ctx: llama_context_p, @@ -2239,11 +2476,11 @@ def llama_save_session_file( tokens: CtypesArray[llama_token], n_token_count: Union[ctypes.c_size_t, int], /, -) -> int: +) -> bool: ... -# // Get the exact size needed to copy the KV cache of a single sequence +# // Get the exact size needed to copy the state of a single sequence # LLAMA_API size_t llama_state_seq_get_size( # struct llama_context * ctx, # llama_seq_id seq_id); @@ -2253,11 +2490,11 @@ def llama_save_session_file( ctypes.c_size_t, ) def llama_state_seq_get_size(ctx: llama_context_p, seq_id: llama_seq_id, /) -> int: - """Get the exact size needed to copy the KV cache of a single sequence""" + """Get the exact size needed to copy the state of a single sequence""" ... -# // Copy the KV cache of a single sequence into the specified buffer +# // Copy the state of a single sequence into the specified buffer # LLAMA_API size_t llama_state_seq_get_data( # struct llama_context * ctx, # uint8_t * dst, @@ -2280,7 +2517,7 @@ def llama_state_seq_get_data( seq_id: llama_seq_id, /, ) -> int: - """Copy the KV cache of a single sequence into the specified buffer""" + """Copy the state of a single sequence into the specified buffer""" ... @@ -2310,7 +2547,7 @@ def llama_state_seq_set_data( dest_seq_id: llama_seq_id, /, ) -> int: - """Copy the sequence data (originally copied with `llama_state_seq_get_data`) into the specified sequence""" + """Copy the sequence data into the specified sequence""" ... @@ -2377,7 +2614,6 @@ def llama_state_seq_load_file( # // Decoding # // - # // Return batch for single sequence of tokens # // The sequence ID will be fixed to 0 # // The position of the tokens will be tracked automatically by llama_decode @@ -2400,7 +2636,7 @@ def llama_batch_get_one( n_tokens: Union[ctypes.c_int, int], /, ) -> llama_batch: - """Return batch for single sequence of tokens starting at pos_0 + """Return batch for single sequence of tokens NOTE: this is a helper function to facilitate transition to the new batch API - avoid using it """ @@ -2445,35 +2681,46 @@ def llama_batch_free(batch: llama_batch, /): ... -# // Processes a batch of tokens with the ecoder part of the encoder-decoder model. -# // Stores the encoder output internally for later use by the decoder cross-attention layers. +# // Process a batch of tokens. +# // In contrast to llama_decode() - this call does not use KV cache. +# // For encode-decoder contexts, processes the batch using the encoder. +# // Can store the encoder output internally for later use by the decoder's cross-attention layers. # // 0 - success -# // < 0 - error +# // < 0 - error. the memory state is restored to the state before this call # LLAMA_API int32_t llama_encode( # struct llama_context * ctx, # struct llama_batch batch); @ctypes_function("llama_encode", [llama_context_p_ctypes, llama_batch], ctypes.c_int32) def llama_encode(ctx: llama_context_p, batch: llama_batch, /) -> int: - """Processes a batch of tokens with the ecoder part of the encoder-decoder model. 
- Stores the encoder output internally for later use by the decoder cross-attention layers. + """Process a batch of tokens using the encoder. 0 - success < 0 - error""" ... +# // Process a batch of tokens. +# // Requires the context to have a memory. +# // For encode-decoder contexts, processes the batch using the decoder. # // Positive return values does not mean a fatal error, but rather a warning. -# // 0 - success -# // 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context) -# // < 0 - error +# // Upon fatal-error or abort, the ubatches that managed to be been processed will remain in the memory state of the context +# // To handle this correctly, query the memory state using llama_memory_seq_pos_min() and llama_memory_seq_pos_max() +# // Upon other return values, the memory state is restored to the state before this call +# // 0 - success +# // 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context) +# // 2 - aborted (processed ubatches will remain in the context's memory) +# // -1 - invalid input batch +# // < -1 - fatal error (processed ubatches will remain in the context's memory) # LLAMA_API int32_t llama_decode( # struct llama_context * ctx, # struct llama_batch batch); @ctypes_function("llama_decode", [llama_context_p_ctypes, llama_batch], ctypes.c_int32) def llama_decode(ctx: llama_context_p, batch: llama_batch, /) -> int: - """Positive return values does not mean a fatal error, but rather a warning. + """Process a batch of tokens. 0 - success 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context) - < 0 - error""" + 2 - aborted (processed ubatches will remain in the context's memory) + -1 - invalid input batch + < -1 - fatal error (processed ubatches will remain in the context's memory)""" ... @@ -2519,13 +2766,12 @@ def llama_n_threads_batch(ctx: llama_context_p, /) -> int: ... -# // Set whether the model is in embeddings mode or not -# // If true, embeddings will be returned but logits will not +# // Set whether the context outputs embeddings or not +# // TODO: rename to avoid confusion with llama_get_embeddings() # LLAMA_API void llama_set_embeddings(struct llama_context * ctx, bool embeddings); @ctypes_function("llama_set_embeddings", [llama_context_p_ctypes, ctypes.c_bool], None) def llama_set_embeddings(ctx: llama_context_p, embeddings: bool, /): - """Set whether the model is in embeddings model or not - If true, embeddings will be returned but logits will not""" + """Set whether the context outputs embeddings or not""" ... @@ -2539,6 +2785,16 @@ def llama_set_causal_attn(ctx: llama_context_p, causal_attn: bool, /): ... +# // Set whether the model is in warmup mode or not +# // If true, all model tensors are activated during llama_decode() to load and cache their weights. +# LLAMA_API void llama_set_warmup(struct llama_context * ctx, bool warmup); +@ctypes_function("llama_set_warmup", [llama_context_p_ctypes, ctypes.c_bool], None) +def llama_set_warmup(ctx: llama_context_p, warmup: bool, /): + """Set whether the model is in warmup mode or not + If true, all model tensors are activated during llama_decode() to load and cache their weights.""" + ... 
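The expanded llama_decode() return contract above is easiest to handle with an explicit check. A minimal sketch:

import llama_cpp

def decode_checked(ctx: llama_cpp.llama_context_p, batch: llama_cpp.llama_batch) -> None:
    ret = llama_cpp.llama_decode(ctx, batch)
    if ret == 0:
        return  # success
    if ret == 1:
        raise RuntimeError("no KV slot for this batch; shrink the batch or grow the context")
    if ret == 2:
        raise RuntimeError("decode aborted; processed ubatches remain in the context's memory")
    if ret == -1:
        raise ValueError("invalid input batch")
    raise RuntimeError(f"fatal llama_decode error: {ret}")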
+ + # // Set abort callback # LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data); @ctypes_function( @@ -2644,7 +2900,7 @@ def llama_get_embeddings_ith( # // Get the embeddings for a sequence id # // Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE -# // when pooling_type == LLAMA_POOLING_TYPE_RANK, returns float[1] with the rank of the sequence +# // when pooling_type == LLAMA_POOLING_TYPE_RANK, returns float[n_cls_out] with the rank(s) of the sequence # // otherwise: float[n_embd] (1-dimensional) # LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id); @ctypes_function( @@ -2665,7 +2921,6 @@ def llama_get_embeddings_seq( # // Vocab # // - # LLAMA_API const char * llama_vocab_get_text(const struct llama_vocab * vocab, llama_token token); @ctypes_function( "llama_vocab_get_text", [llama_vocab_p_ctypes, llama_token], ctypes.c_char_p @@ -2719,8 +2974,6 @@ def llama_vocab_is_control( # // Special tokens - - # LLAMA_API llama_token llama_vocab_bos(const struct llama_vocab * vocab); // beginning-of-sentence @ctypes_function("llama_vocab_bos", [llama_vocab_p_ctypes], llama_token) def llama_vocab_bos(vocab: llama_vocab_p, /) -> llama_token: @@ -2762,6 +3015,7 @@ def llama_vocab_pad(vocab: llama_vocab_p, /) -> llama_token: """padding""" ... + # LLAMA_API bool llama_vocab_get_add_bos(const struct llama_vocab * vocab); @ctypes_function( "llama_vocab_get_add_bos", @@ -2782,6 +3036,16 @@ def llama_vocab_get_add_eos(vocab: llama_vocab_p, /) -> bool: ... +# LLAMA_API bool llama_vocab_get_add_sep(const struct llama_vocab * vocab); +@ctypes_function( + "llama_vocab_get_add_sep", + [llama_vocab_p_ctypes], + ctypes.c_bool, +) +def llama_vocab_get_add_sep(vocab: llama_vocab_p, /) -> bool: + ... + + # LLAMA_API llama_token llama_vocab_fim_pre(const struct llama_vocab * vocab); @ctypes_function( "llama_vocab_fim_pre", @@ -2842,7 +3106,7 @@ def llama_vocab_fim_sep(vocab: llama_vocab_p, /) -> llama_token: ... - +# DEPRECATED functions # DEPRECATED(LLAMA_API const char * llama_token_get_text(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_get_text instead"); @ctypes_function( "llama_token_get_text", @@ -3056,11 +3320,11 @@ def llama_vocab_cls(vocab: llama_vocab_p, /) -> llama_token: # // The API is thread-safe. # // - # /// @details Convert the provided text into tokens. # /// @param tokens The tokens pointer must be large enough to hold the resulting tokens. # /// @return Returns the number of tokens on success, no more than n_tokens_max # /// @return Returns a negative number on failure - the number of tokens that would have been returned +# /// @return Returns INT32_MIN on overflow (e.g., tokenization result size exceeds int32_t limit) # /// @param add_special Allow to add BOS and EOS tokens if model is configured to do so. # /// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated # /// as plaintext. Does not insert a leading space. @@ -3103,7 +3367,7 @@ def llama_tokenize( text_len: The length of the text. tokens: The tokens pointer must be large enough to hold the resulting tokens. n_max_tokens: The maximum number of tokens to return. - add_special: Allow adding special tokenns if the model is configured to do so. + add_special: Allow adding special tokens if the model is configured to do so. parse_special: Allow parsing special tokens. Returns: @@ -3161,23 +3425,6 @@ def llama_token_to_piece( ... 
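A small sketch of the vocab-centric accessors above; the vocab handle comes from llama_model_get_vocab(), and llama_vocab_eos is assumed to be bound alongside the functions shown here:

import llama_cpp

def special_tokens(model: llama_cpp.llama_model_p) -> dict:
    vocab = llama_cpp.llama_model_get_vocab(model)
    return {
        "bos": llama_cpp.llama_vocab_bos(vocab),
        "eos": llama_cpp.llama_vocab_eos(vocab),
        "pad": llama_cpp.llama_vocab_pad(vocab),
        "add_bos": llama_cpp.llama_vocab_get_add_bos(vocab),
        "add_sep": llama_cpp.llama_vocab_get_add_sep(vocab),  # new in this revision
    }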
-# # // check if token0 is contained as a prefix in token1 -# # LLAMA_API bool llama_token_is_prefix( -# # const struct llama_model * model, -# # llama_token token0, -# # llama_token token1); -# @ctypes_function( -# "llama_token_is_prefix", -# [llama_model_p_ctypes, llama_token, llama_token], -# ctypes.c_bool, -# ) -# def llama_token_is_prefix( -# model: llama_model_p, token0: Union[llama_token, int], token1: Union[llama_token, int], / -# ) -> bool: -# """Check if token0 is contained as a prefix in token1""" -# ... - - # /// @details Convert the provided tokens into text (inverse of llama_tokenize()). # /// @param text The char pointer must be large enough to hold the resulting text. # /// @return Returns the number of chars/bytes on success, no more than text_len_max. @@ -3185,7 +3432,7 @@ def llama_token_to_piece( # /// @param remove_special Allow to remove BOS and EOS tokens if model is configured to do so. # /// @param unparse_special If true, special tokens are rendered in the output. # LLAMA_API int32_t llama_detokenize( -# const struct llama_model * model, +# const struct llama_vocab * vocab, # const llama_token * tokens, # int32_t n_tokens, # char * text, @@ -3195,7 +3442,7 @@ def llama_token_to_piece( @ctypes_function( "llama_detokenize", [ - llama_model_p_ctypes, + llama_vocab_p_ctypes, ctypes.POINTER(llama_token), ctypes.c_int32, ctypes.c_char_p, @@ -3206,7 +3453,7 @@ def llama_token_to_piece( ctypes.c_int32, ) def llama_detokenize( - model: llama_model_p, + vocab: llama_vocab_p, tokens: CtypesArray[llama_token], n_tokens: Union[ctypes.c_int, int], text: bytes, @@ -3218,7 +3465,7 @@ def llama_detokenize( """Convert the provided tokens into text (inverse of llama_tokenize()). Args: - model: The model to use for tokenization. + vocab: The vocabulary to use for tokenization. tokens: The tokens to convert. n_tokens: The number of tokens. text: The buffer to write the text to. @@ -3232,11 +3479,10 @@ def llama_detokenize( # // Chat templates # // - # /// Apply chat template. Inspired by hf apply_chat_template() on python. # /// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model" -# /// NOTE: This function does not use a jinja parser. It only support a pre-defined list of template. See more: https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template -# /// @param tmpl A Jinja template to use for this chat. If this is nullptr, the model’s default chat template will be used instead. +# /// NOTE: This function does not use a jinja parser. It only support a pre-defined list of template. See more: https://github.com/ggml-org/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template +# /// @param tmpl A Jinja template to use for this chat. If this is nullptr, the model's default chat template will be used instead. # /// @param chat Pointer to a list of multiple llama_chat_message # /// @param n_msg Number of llama_chat_message in this chat # /// @param add_ass Whether to end the prompt with the token(s) that indicate the start of an assistant message. 
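Since llama_detokenize() now takes the vocab rather than the model, a tokenize/detokenize round trip looks roughly like this (buffer sizes are illustrative, and the negative-return convention is used to detect undersized buffers):

import ctypes
import llama_cpp

def roundtrip(model: llama_cpp.llama_model_p, text: str) -> str:
    vocab = llama_cpp.llama_model_get_vocab(model)
    raw = text.encode("utf-8")

    toks = (llama_cpp.llama_token * (len(raw) + 8))()  # generous upper bound
    n = llama_cpp.llama_tokenize(vocab, raw, len(raw), toks, len(toks), True, False)
    if n < 0:
        raise RuntimeError("token buffer too small")

    out = ctypes.create_string_buffer(4 * len(raw) + 16)  # illustrative size
    m = llama_cpp.llama_detokenize(vocab, toks, n, out, len(out), False, False)
    if m < 0:
        raise RuntimeError("text buffer too small")
    return out.raw[:m].decode("utf-8", errors="replace")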
@@ -3318,41 +3564,6 @@ def llama_chat_builtin_templates( # // # // Sampling API # // -# // Sample usage: -# // -# // // prepare the sampling chain at the start -# // auto sparams = llama_sampler_chain_default_params(); -# // -# // llama_sampler * smpl = llama_sampler_chain_init(sparams); -# // -# // llama_sampler_chain_add(smpl, llama_sampler_init_top_k(50)); -# // llama_sampler_chain_add(smpl, llama_sampler_init_top_p(0.9, 1)); -# // llama_sampler_chain_add(smpl, llama_sampler_init_temp (0.8)); -# // -# // // typically, the chain should end with a sampler such as "greedy", "dist" or "mirostat" -# // // this sampler will be responsible to select the actual token -# // llama_sampler_chain_add(smpl, llama_sampler_init_dist(seed)); -# // -# // ... -# // -# // // decoding loop: -# // while (...) { -# // ... -# // -# // llama_decode(ctx, batch); -# // -# // // sample from the logits of the last token in the batch -# // const llama_token id = llama_sampler_sample(smpl, ctx, -1); -# // -# // // accepting the token updates the internal state of certain samplers (e.g. grammar, repetition, etc.) -# // llama_sampler_accept(smpl, id); -# // ... -# // } -# // -# // llama_sampler_free(smpl); -# // -# // TODO: In the future, llama_sampler will be utilized to offload the sampling to the backends (e.g. GPU). -# // # typedef void * llama_sampler_context_t; llama_sampler_context_t = ctypes.c_void_p @@ -3366,7 +3577,7 @@ def llama_chat_builtin_templates( # void (*reset) ( struct llama_sampler * smpl); // can be NULL # struct llama_sampler * (*clone) (const struct llama_sampler * smpl); // can be NULL if ctx is NULL # void (*free) ( struct llama_sampler * smpl); // can be NULL if ctx is NULL -# + # // TODO: API for internal libllama usage for appending the sampling to an existing ggml_cgraph # //void (*apply_ggml) (struct llama_sampler * smpl, ...); # }; @@ -3375,8 +3586,8 @@ class llama_sampler_i(ctypes.Structure): # struct llama_sampler { -# struct llama_sampler_i * iface; -# llama_sampler_context_t ctx; +# const struct llama_sampler_i * iface; +# llama_sampler_context_t ctx; # }; class llama_sampler(ctypes.Structure): _fields_ = [ @@ -3410,6 +3621,18 @@ class llama_sampler(ctypes.Structure): # // mirror of llama_sampler_i: +# LLAMA_API struct llama_sampler * llama_sampler_init (const struct llama_sampler_i * iface, llama_sampler_context_t ctx); +@ctypes_function( + "llama_sampler_init", + [ctypes.POINTER(llama_sampler_i), llama_sampler_context_t], + llama_sampler_p_ctypes, +) +def llama_sampler_init( + iface: ctypes.POINTER(llama_sampler_i), ctx: llama_sampler_context_t, / +) -> llama_sampler_p: + ... 
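The sampler-chain usage that the removed C comment described maps almost one to one onto these bindings. A minimal sketch:

import llama_cpp

def build_sampler(seed: int = 1234) -> llama_cpp.llama_sampler_p:
    sparams = llama_cpp.llama_sampler_chain_default_params()
    chain = llama_cpp.llama_sampler_chain_init(sparams)
    llama_cpp.llama_sampler_chain_add(chain, llama_cpp.llama_sampler_init_top_k(50))
    llama_cpp.llama_sampler_chain_add(chain, llama_cpp.llama_sampler_init_top_p(0.9, 1))
    llama_cpp.llama_sampler_chain_add(chain, llama_cpp.llama_sampler_init_temp(0.8))
    # The chain should end with a selecting sampler such as dist, greedy or mirostat.
    llama_cpp.llama_sampler_chain_add(chain, llama_cpp.llama_sampler_init_dist(seed))
    return chain

# In a decoding loop, after llama_decode(ctx, batch):
#   token = llama_cpp.llama_sampler_sample(chain, ctx, -1)
# and when finished:
#   llama_cpp.llama_sampler_free(chain)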
+ + # LLAMA_API const char * llama_sampler_name (const struct llama_sampler * smpl); @ctypes_function( "llama_sampler_name", @@ -3475,7 +3698,7 @@ def llama_sampler_free(smpl: llama_sampler_p, /): # // llama_sampler_chain # // a type of llama_sampler that can chain multiple samplers one after another -# + # LLAMA_API struct llama_sampler * llama_sampler_chain_init(struct llama_sampler_chain_params params); @ctypes_function( "llama_sampler_chain_init", @@ -3533,7 +3756,7 @@ def llama_sampler_chain_remove( # // available samplers: -# + # LLAMA_API struct llama_sampler * llama_sampler_init_greedy(void); @ctypes_function("llama_sampler_init_greedy", [], llama_sampler_p_ctypes) def llama_sampler_init_greedy() -> llama_sampler_p: @@ -3549,13 +3772,14 @@ def llama_sampler_init_dist(seed: int) -> llama_sampler_p: # /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. # /// NOTE: Avoid using on the full vocabulary as the sorting can become slow. For example, apply top-k or top-p sampling first. # DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_softmax (void), -# "will be removed in the future (see https://github.com/ggerganov/llama.cpp/pull/9896#discussion_r1800920915)"); +# "will be removed in the future (see https://github.com/ggml-org/llama.cpp/pull/9896#discussion_r1800920915)"); @ctypes_function("llama_sampler_init_softmax", [], llama_sampler_p_ctypes) def llama_sampler_init_softmax() -> llama_sampler_p: ... # /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 +# /// Setting k <= 0 makes this a noop # LLAMA_API struct llama_sampler * llama_sampler_init_top_k (int32_t k); @ctypes_function("llama_sampler_init_top_k", [ctypes.c_int32], llama_sampler_p_ctypes) def llama_sampler_init_top_k(k: int) -> llama_sampler_p: @@ -3573,7 +3797,7 @@ def llama_sampler_init_top_p(p: float, min_keep: int) -> llama_sampler_p: ... -# /// @details Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841 +# /// @details Minimum P sampling as described in https://github.com/ggml-org/llama.cpp/pull/3841 # LLAMA_API struct llama_sampler * llama_sampler_init_min_p (float p, size_t min_keep); @ctypes_function( "llama_sampler_init_min_p", @@ -3595,6 +3819,7 @@ def llama_sampler_init_typical(p: float, min_keep: int) -> llama_sampler_p: ... +# /// #details Updates the logits l_i` = l_i/t. When t <= 0.0f, the maximum logit is kept at it's original value, the rest are set to -inf # LLAMA_API struct llama_sampler * llama_sampler_init_temp (float t); @ctypes_function("llama_sampler_init_temp", [ctypes.c_float], llama_sampler_p_ctypes) def llama_sampler_init_temp(t: float) -> llama_sampler_p: @@ -3627,12 +3852,18 @@ def llama_sampler_init_xtc( ... +# /// @details Top n sigma sampling as described in academic paper "Top-nσ: Not All Logits Are You Need" https://arxiv.org/pdf/2411.07641 +# LLAMA_API struct llama_sampler * llama_sampler_init_top_n_sigma(float n); +@ctypes_function( + "llama_sampler_init_top_n_sigma", + [ctypes.c_float], + llama_sampler_p_ctypes, +) +def llama_sampler_init_top_n_sigma(n: float, /) -> llama_sampler_p: + ... + + # /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. -# /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. 
-# /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. -# /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. -# /// @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm. -# /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. # LLAMA_API struct llama_sampler * llama_sampler_init_mirostat( # int32_t n_vocab, # uint32_t seed, @@ -3651,10 +3882,6 @@ def llama_sampler_init_mirostat( # /// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. -# /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. -# /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. -# /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. -# /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. # LLAMA_API struct llama_sampler * llama_sampler_init_mirostat_v2( # uint32_t seed, # float tau, @@ -3670,6 +3897,7 @@ def llama_sampler_init_mirostat_v2( ... +# /// @details Intializes a GBNF grammar, see grammars/README.md for details. # LLAMA_API struct llama_sampler * llama_sampler_init_grammar( # const struct llama_vocab * vocab, # const char * grammar_str, @@ -3685,6 +3913,76 @@ def llama_sampler_init_grammar( ... 
+# DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_grammar_lazy( +# const struct llama_vocab * vocab, +# const char * grammar_str, +# const char * grammar_root, +# const char ** trigger_words, +# size_t num_trigger_words, +# const llama_token * trigger_tokens, +# size_t num_trigger_tokens), +# "use llama_sampler_init_grammar_lazy_patterns instead"); +@ctypes_function( + "llama_sampler_init_grammar_lazy", + [ + llama_vocab_p_ctypes, + ctypes.c_char_p, + ctypes.c_char_p, + ctypes.POINTER(ctypes.c_char_p), + ctypes.c_size_t, + ctypes.POINTER(llama_token), + ctypes.c_size_t, + ], + llama_sampler_p_ctypes, +) +def llama_sampler_init_grammar_lazy( + vocab: llama_vocab_p, + grammar_str: bytes, + grammar_root: bytes, + trigger_words: CtypesArray[bytes], + num_trigger_words: int, + trigger_tokens: CtypesArray[llama_token], + num_trigger_tokens: int, + /, +) -> llama_sampler_p: + ... + + +# /// @details Lazy grammar sampler, introduced in https://github.com/ggml-org/llama.cpp/pull/9639 +# LLAMA_API struct llama_sampler * llama_sampler_init_grammar_lazy_patterns( +# const struct llama_vocab * vocab, +# const char * grammar_str, +# const char * grammar_root, +# const char ** trigger_patterns, +# size_t num_trigger_patterns, +# const llama_token * trigger_tokens, +# size_t num_trigger_tokens); +@ctypes_function( + "llama_sampler_init_grammar_lazy_patterns", + [ + llama_vocab_p_ctypes, + ctypes.c_char_p, + ctypes.c_char_p, + ctypes.POINTER(ctypes.c_char_p), + ctypes.c_size_t, + ctypes.POINTER(llama_token), + ctypes.c_size_t, + ], + llama_sampler_p_ctypes, +) +def llama_sampler_init_grammar_lazy_patterns( + vocab: llama_vocab_p, + grammar_str: bytes, + grammar_root: bytes, + trigger_patterns: CtypesArray[bytes], + num_trigger_patterns: int, + trigger_tokens: CtypesArray[llama_token], + num_trigger_tokens: int, + /, +) -> llama_sampler_p: + ... + + # /// NOTE: Avoid using on the full vocabulary as searching for repeated tokens can become slow. For example, apply top-k or top-p sampling first. # LLAMA_API struct llama_sampler * llama_sampler_init_penalties( # int32_t penalty_last_n, // last n tokens to penalize (0 = disable penalty, -1 = context size) @@ -3737,7 +4035,7 @@ def llama_sampler_init_dry( dry_base: float, dry_allowed_length: int, dry_penalty_last_n: int, - seq_breakers: CtypesArray[bytes], + seq_breakers, num_breakers: int, /, ) -> llama_sampler_p: @@ -3760,26 +4058,6 @@ def llama_sampler_init_logit_bias( # // this sampler is meant to be used for fill-in-the-middle infilling -# // it's supposed to be used after top_k + top_p sampling -# // -# // 1. if the sum of the EOG probs times the number of candidates is higher than the sum of the other probs -> pick EOG -# // 2. combine probs of tokens that have the same prefix -# // -# // example: -# // -# // - before: -# // "hel": 0.5 -# // "hell": 0.2 -# // "hello": 0.1 -# // "dummy": 0.1 -# // -# // - after: -# // "hel": 0.8 -# // "dummy": 0.1 -# // -# // 3. discard non-EOG tokens with low prob -# // 4. if no tokens are left -> pick EOT -# // # LLAMA_API struct llama_sampler * llama_sampler_init_infill(const struct llama_vocab * vocab); @ctypes_function( "llama_sampler_init_infill", @@ -3802,15 +4080,6 @@ def llama_sampler_get_seed(smpl: llama_sampler_p, /) -> int: # /// @details Sample and accept a token from the idx-th output of the last evaluation -# // -# // Shorthand for: -# // const auto * logits = llama_get_logits_ith(ctx, idx); -# // llama_token_data_array cur_p = { ... init from logits ... 
}; -# // llama_sampler_apply(smpl, &cur_p); -# // auto token = cur_p.data[cur_p.selected].id; -# // llama_sampler_accept(smpl, token); -# // return token; -# // Returns the sampled token # LLAMA_API llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_context * ctx, int32_t idx); @ctypes_function( "llama_sampler_sample", @@ -3827,10 +4096,7 @@ def llama_sampler_sample( # // Model split # // - # /// @details Build a split GGUF final path for this chunk. -# /// llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf" -# // Returns the split_path length. # LLAMA_API int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count); @ctypes_function( "llama_split_path", @@ -3850,8 +4116,6 @@ def llama_split_path( # /// @details Extract the path prefix from the split_path if and only if the split_no and split_count match. -# /// llama_split_prefix(split_prefix, 64, "/models/ggml-model-q4_0-00002-of-00004.gguf", 2, 4) => split_prefix = "/models/ggml-model-q4_0" -# // Returns the split_prefix length. # LLAMA_API int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count); @ctypes_function( "llama_split_prefix", @@ -3899,16 +4163,13 @@ def llama_log_set( # // # // Performance utils # // -# // NOTE: Used by llama.cpp examples, avoid using in third-party apps. Instead, do your own performance measurements. -# // - # struct llama_perf_context_data { # double t_start_ms; # double t_load_ms; # double t_p_eval_ms; # double t_eval_ms; -# + # int32_t n_p_eval; # int32_t n_eval; # }; @@ -3925,7 +4186,7 @@ class llama_perf_context_data(ctypes.Structure): # struct llama_perf_sampler_data { # double t_sample_ms; -# + # int32_t n_sample; # }; class llama_perf_sampler_data(ctypes.Structure): @@ -3996,3 +4257,83 @@ def llama_perf_sampler_reset(chain: llama_sampler_p, /): ... +# // +# // training +# // + +# // function that returns whether or not a given tensor contains trainable parameters +# typedef bool (*llama_opt_param_filter)(const struct ggml_tensor * tensor, void * userdata); +llama_opt_param_filter = ctypes.CFUNCTYPE(ctypes.c_bool, ctypes.c_void_p, ctypes.c_void_p) + +# // always returns true +# LLAMA_API bool llama_opt_param_filter_all(const struct ggml_tensor * tensor, void * userdata); +@ctypes_function( + "llama_opt_param_filter_all", + [ctypes.c_void_p, ctypes.c_void_p], + ctypes.c_bool, +) +def llama_opt_param_filter_all(tensor: ctypes.c_void_p, userdata: ctypes.c_void_p, /) -> bool: + ... 
+ + +# struct llama_opt_params { +# uint32_t n_ctx_train; // assumed context size post training, use context size specified in llama_context if 0 + +# llama_opt_param_filter param_filter; // callback for determining which tensors contain trainable parameters +# void * param_filter_ud; // userdata for determining which tensors contain trainable parameters + +# ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters +# void * get_opt_pars_ud; // userdata for calculating optimizer parameters +# }; +class llama_opt_params(ctypes.Structure): + _fields_ = [ + ("n_ctx_train", ctypes.c_uint32), + ("param_filter", llama_opt_param_filter), + ("param_filter_ud", ctypes.c_void_p), + ("get_opt_pars", ctypes.c_void_p), # ggml_opt_get_optimizer_params - not implemented here + ("get_opt_pars_ud", ctypes.c_void_p), + ] + + +# LLAMA_API void llama_opt_init(struct llama_context * lctx, struct llama_model * model, struct llama_opt_params lopt_params); +@ctypes_function( + "llama_opt_init", + [llama_context_p_ctypes, llama_model_p_ctypes, llama_opt_params], + None, +) +def llama_opt_init(lctx: llama_context_p, model: llama_model_p, lopt_params: llama_opt_params, /): + ... + + +# LLAMA_API void llama_opt_epoch( +# struct llama_context * lctx, +# ggml_opt_dataset_t dataset, +# ggml_opt_result_t result_train, +# ggml_opt_result_t result_eval, +# int64_t idata_split, +# ggml_opt_epoch_callback callback_train, +# ggml_opt_epoch_callback callback_eval); +@ctypes_function( + "llama_opt_epoch", + [ + llama_context_p_ctypes, + ctypes.c_void_p, # ggml_opt_dataset_t + ctypes.c_void_p, # ggml_opt_result_t + ctypes.c_void_p, # ggml_opt_result_t + ctypes.c_int64, + ctypes.c_void_p, # ggml_opt_epoch_callback + ctypes.c_void_p, # ggml_opt_epoch_callback + ], + None, +) +def llama_opt_epoch( + lctx: llama_context_p, + dataset: ctypes.c_void_p, + result_train: ctypes.c_void_p, + result_eval: ctypes.c_void_p, + idata_split: int, + callback_train: ctypes.c_void_p, + callback_eval: ctypes.c_void_p, + /, +): + ... 
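
For readers of this patch, here is a minimal sketch (not part of the diff) of how the sampler-chain bindings touched above could be driven from Python. It uses only wrappers exposed by llama_cpp.py (llama_sampler_chain_default_params, llama_sampler_chain_init, llama_sampler_chain_add, the llama_sampler_init_* constructors including the new llama_sampler_init_top_n_sigma, llama_sampler_sample, llama_sampler_free); the parameter values and the ordering of the stages are illustrative assumptions, not recommendations made by the patch.

import llama_cpp.llama_cpp as llama_cpp

# Build a standalone sampler chain; a real caller would pair it with a live
# llama_context and draw tokens via llama_sampler_sample(chain, ctx, -1).
params = llama_cpp.llama_sampler_chain_default_params()
chain = llama_cpp.llama_sampler_chain_init(params)

# Truncation stages first, then temperature, then the final probabilistic pick.
# The concrete values (40, 1.0, 0.8, 1234) are placeholders for illustration.
llama_cpp.llama_sampler_chain_add(chain, llama_cpp.llama_sampler_init_top_k(40))
llama_cpp.llama_sampler_chain_add(chain, llama_cpp.llama_sampler_init_top_n_sigma(1.0))  # binding added in this patch
llama_cpp.llama_sampler_chain_add(chain, llama_cpp.llama_sampler_init_temp(0.8))
llama_cpp.llama_sampler_chain_add(chain, llama_cpp.llama_sampler_init_dist(1234))

# With a loaded model/context this would sample from the last set of logits:
#   token = llama_cpp.llama_sampler_sample(chain, ctx, -1)

# The chain takes ownership of the samplers added to it, so freeing the chain
# is sufficient to release everything.
llama_cpp.llama_sampler_free(chain)
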
diff --git a/llama_cpp/mtmd_cpp.py b/llama_cpp/mtmd_cpp.py new file mode 100644 index 000000000..a45f8f406 --- /dev/null +++ b/llama_cpp/mtmd_cpp.py @@ -0,0 +1,280 @@ +from __future__ import annotations + +import os +from ctypes import ( + c_bool, + c_char_p, + c_int, + c_uint8, + c_uint32, + c_float, + c_void_p, + c_size_t, + POINTER, + _Pointer, # type: ignore + Structure, + byref, +) +import pathlib +from typing import ( + Union, + NewType, + Optional, + TYPE_CHECKING, +) + +import llama_cpp.llama_cpp as llama_cpp + +from llama_cpp._ctypes_extensions import ( + load_shared_library, + ctypes_function_for_shared_library, +) + +if TYPE_CHECKING: + from llama_cpp._ctypes_extensions import ( + CtypesArray, + ) + + +# Specify the base name of the shared library to load +_libmtmd_base_name = "mtmd" +_libmtmd_override_path = os.environ.get("MTMD_CPP_LIB") +_libmtmd_base_path = pathlib.Path(os.path.abspath(os.path.dirname(__file__))) / "lib" if _libmtmd_override_path is None else pathlib.Path() + +# Load the library +_libmtmd = load_shared_library(_libmtmd_base_name, _libmtmd_base_path) + +ctypes_function = ctypes_function_for_shared_library(_libmtmd) + +################################################ +# mtmd.h types +################################################ + +# Opaque types +mtmd_context_p = NewType("mtmd_context_p", int) +mtmd_context_p_ctypes = c_void_p + +mtmd_bitmap_p = NewType("mtmd_bitmap_p", int) +mtmd_bitmap_p_ctypes = c_void_p + +mtmd_image_tokens_p = NewType("mtmd_image_tokens_p", int) +mtmd_image_tokens_p_ctypes = c_void_p + +mtmd_input_chunk_p = NewType("mtmd_input_chunk_p", int) +mtmd_input_chunk_p_ctypes = c_void_p + +mtmd_input_chunks_p = NewType("mtmd_input_chunks_p", int) +mtmd_input_chunks_p_ctypes = c_void_p + +# Enums +MTMD_INPUT_CHUNK_TYPE_TEXT = 0 +MTMD_INPUT_CHUNK_TYPE_IMAGE = 1 +MTMD_INPUT_CHUNK_TYPE_AUDIO = 2 + +# Structures +class mtmd_context_params(Structure): + _fields_ = [ + ("use_gpu", c_bool), + ("print_timings", c_bool), + ("n_threads", c_int), + ("verbosity", c_int), # ggml_log_level + ("image_marker", c_char_p), + ("media_marker", c_char_p), + ] + +class mtmd_input_text(Structure): + _fields_ = [ + ("text", c_char_p), + ("add_special", c_bool), + ("parse_special", c_bool), + ] + +################################################ +# mtmd.h functions +################################################ + +# MTMD_API const char * mtmd_default_marker(void); +@ctypes_function("mtmd_default_marker", [], c_char_p) +def mtmd_default_marker() -> bytes: + ... + +# MTMD_API struct mtmd_context_params mtmd_context_params_default(void); +@ctypes_function("mtmd_context_params_default", [], mtmd_context_params) +def mtmd_context_params_default() -> mtmd_context_params: + ... + +# MTMD_API mtmd_context * mtmd_init_from_file(const char * mmproj_fname, +# const struct llama_model * text_model, +# const struct mtmd_context_params ctx_params); +@ctypes_function( + "mtmd_init_from_file", + [c_char_p, llama_cpp.llama_model_p_ctypes, mtmd_context_params], + mtmd_context_p_ctypes +) +def mtmd_init_from_file( + mmproj_fname: bytes, + text_model: llama_cpp.llama_model_p, + ctx_params: mtmd_context_params, + /, +) -> Optional[mtmd_context_p]: + ... + +# MTMD_API void mtmd_free(mtmd_context * ctx); +@ctypes_function("mtmd_free", [mtmd_context_p_ctypes], None) +def mtmd_free(ctx: mtmd_context_p, /): + ... 
+ +# MTMD_API bool mtmd_support_vision(mtmd_context * ctx); +@ctypes_function("mtmd_support_vision", [mtmd_context_p_ctypes], c_bool) +def mtmd_support_vision(ctx: mtmd_context_p, /) -> bool: + ... + +# MTMD_API mtmd_bitmap * mtmd_bitmap_init(uint32_t nx, uint32_t ny, const unsigned char * data); +@ctypes_function( + "mtmd_bitmap_init", + [c_uint32, c_uint32, POINTER(c_uint8)], + mtmd_bitmap_p_ctypes +) +def mtmd_bitmap_init( + nx: Union[c_uint32, int], + ny: Union[c_uint32, int], + data: CtypesArray[c_uint8], + /, +) -> Optional[mtmd_bitmap_p]: + ... + +# MTMD_API void mtmd_bitmap_free(mtmd_bitmap * bitmap); +@ctypes_function("mtmd_bitmap_free", [mtmd_bitmap_p_ctypes], None) +def mtmd_bitmap_free(bitmap: mtmd_bitmap_p, /): + ... + +# MTMD_API mtmd_input_chunks * mtmd_input_chunks_init(void); +@ctypes_function("mtmd_input_chunks_init", [], mtmd_input_chunks_p_ctypes) +def mtmd_input_chunks_init() -> Optional[mtmd_input_chunks_p]: + ... + +# MTMD_API void mtmd_input_chunks_free(mtmd_input_chunks * chunks); +@ctypes_function("mtmd_input_chunks_free", [mtmd_input_chunks_p_ctypes], None) +def mtmd_input_chunks_free(chunks: mtmd_input_chunks_p, /): + ... + +# MTMD_API size_t mtmd_input_chunks_size(const mtmd_input_chunks * chunks); +@ctypes_function("mtmd_input_chunks_size", [mtmd_input_chunks_p_ctypes], c_size_t) +def mtmd_input_chunks_size(chunks: mtmd_input_chunks_p, /) -> int: + ... + +# MTMD_API const mtmd_input_chunk * mtmd_input_chunks_get(const mtmd_input_chunks * chunks, size_t idx); +@ctypes_function( + "mtmd_input_chunks_get", + [mtmd_input_chunks_p_ctypes, c_size_t], + mtmd_input_chunk_p_ctypes +) +def mtmd_input_chunks_get( + chunks: mtmd_input_chunks_p, idx: Union[c_size_t, int], / +) -> Optional[mtmd_input_chunk_p]: + ... + +# MTMD_API int32_t mtmd_tokenize(mtmd_context * ctx, +# mtmd_input_chunks * output, +# const mtmd_input_text * text, +# const mtmd_bitmap ** bitmaps, +# size_t n_bitmaps); +@ctypes_function( + "mtmd_tokenize", + [ + mtmd_context_p_ctypes, + mtmd_input_chunks_p_ctypes, + POINTER(mtmd_input_text), + POINTER(mtmd_bitmap_p_ctypes), + c_size_t, + ], + c_int, +) +def mtmd_tokenize( + ctx: mtmd_context_p, + output: mtmd_input_chunks_p, + text: "_Pointer[mtmd_input_text]", + bitmaps: CtypesArray[mtmd_bitmap_p_ctypes], + n_bitmaps: Union[c_size_t, int], + /, +) -> int: + ... + +# MTMD_API size_t mtmd_input_chunk_get_n_tokens(const mtmd_input_chunk * chunk); +@ctypes_function("mtmd_input_chunk_get_n_tokens", [mtmd_input_chunk_p_ctypes], c_size_t) +def mtmd_input_chunk_get_n_tokens(chunk: mtmd_input_chunk_p, /) -> int: + ... + +# MTMD_API enum mtmd_input_chunk_type mtmd_input_chunk_get_type(const mtmd_input_chunk * chunk); +@ctypes_function("mtmd_input_chunk_get_type", [mtmd_input_chunk_p_ctypes], c_int) +def mtmd_input_chunk_get_type(chunk: mtmd_input_chunk_p, /) -> int: + ... + +# MTMD_API const llama_token * mtmd_input_chunk_get_tokens_text(const mtmd_input_chunk * chunk, size_t * n_tokens_output); +@ctypes_function( + "mtmd_input_chunk_get_tokens_text", + [mtmd_input_chunk_p_ctypes, POINTER(c_size_t)], + POINTER(llama_cpp.llama_token) +) +def mtmd_input_chunk_get_tokens_text( + chunk: mtmd_input_chunk_p, n_tokens_output: "_Pointer[c_size_t]", / +) -> Optional["_Pointer[llama_cpp.llama_token]"]: + ... 
+ +################################################ +# mtmd-helper.h functions +################################################ + +# MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len); +@ctypes_function( + "mtmd_helper_bitmap_init_from_buf", + [mtmd_context_p_ctypes, POINTER(c_uint8), c_size_t], + mtmd_bitmap_p_ctypes +) +def mtmd_helper_bitmap_init_from_buf( + ctx: mtmd_context_p, + buf: CtypesArray[c_uint8], + length: Union[c_size_t, int], + /, +) -> Optional[mtmd_bitmap_p]: + ... + +# MTMD_API size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks); +@ctypes_function("mtmd_helper_get_n_tokens", [mtmd_input_chunks_p_ctypes], c_size_t) +def mtmd_helper_get_n_tokens(chunks: mtmd_input_chunks_p, /) -> int: + ... + +# MTMD_API int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx, +# struct llama_context * lctx, +# const mtmd_input_chunk * chunk, +# llama_pos n_past, +# llama_seq_id seq_id, +# int32_t n_batch, +# bool logits_last, +# llama_pos * new_n_past); +@ctypes_function( + "mtmd_helper_eval_chunk_single", + [ + mtmd_context_p_ctypes, + llama_cpp.llama_context_p_ctypes, + mtmd_input_chunk_p_ctypes, + llama_cpp.llama_pos, + llama_cpp.llama_seq_id, + c_int, + c_bool, + POINTER(llama_cpp.llama_pos), + ], + c_int, +) +def mtmd_helper_eval_chunk_single( + ctx: mtmd_context_p, + lctx: llama_cpp.llama_context_p, + chunk: mtmd_input_chunk_p, + n_past: llama_cpp.llama_pos, + seq_id: llama_cpp.llama_seq_id, + n_batch: Union[c_int, int], + logits_last: Union[c_bool, bool], + new_n_past: "_Pointer[llama_cpp.llama_pos]", + /, +) -> int: + ... diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py index c6716f919..11bd363b5 100644 --- a/llama_cpp/server/model.py +++ b/llama_cpp/server/model.py @@ -171,6 +171,20 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama: chat_handler = llama_cpp.llama_chat_format.MiniCPMv26ChatHandler( clip_model_path=settings.clip_model_path, verbose=settings.verbose ) + elif settings.chat_format == "qwen2.5-vl": + assert settings.clip_model_path is not None, "clip model not found" + if settings.hf_model_repo_id is not None: + chat_handler = ( + llama_cpp.llama_chat_format.Qwen25VLChatHandler.from_pretrained( + repo_id=settings.hf_model_repo_id, + filename=settings.clip_model_path, + verbose=settings.verbose, + ) + ) + else: + chat_handler = llama_cpp.llama_chat_format.Qwen25VLChatHandler( + clip_model_path=settings.clip_model_path, verbose=settings.verbose + ) elif settings.chat_format == "hf-autotokenizer": assert ( settings.hf_pretrained_model_name_or_path is not None diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 794fe23f2..8846aace4 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 794fe23f29fb40104975c91fe19f23798f7c726e +Subproject commit 8846aace4934ad29651ea61b8c7e3f6b0556e3d2
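
Beyond the server wiring above, the new Qwen2.5-VL handler can also be used in-process. The sketch below follows the existing multimodal chat-handler pattern in llama-cpp-python; the model path, mmproj path, image URL, and n_ctx value are placeholders rather than anything prescribed by this patch.

from llama_cpp import Llama
from llama_cpp.llama_chat_format import Qwen25VLChatHandler

# Placeholder paths: a Qwen2.5-VL text model plus its multimodal projector (mmproj).
chat_handler = Qwen25VLChatHandler(clip_model_path="path/to/mmproj-qwen2.5-vl.gguf")
llm = Llama(
    model_path="path/to/qwen2.5-vl-3b-instruct.gguf",
    chat_handler=chat_handler,
    n_ctx=4096,  # leave room for the image embeddings in addition to the text prompt
)

response = llm.create_chat_completion(
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
                {"type": "text", "text": "Describe this image in one sentence."},
            ],
        }
    ]
)
print(response["choices"][0]["message"]["content"])

The equivalent server-side configuration is the branch added to llama_cpp/server/model.py above, selected with --chat_format qwen2.5-vl and --clip_model_path pointing at the mmproj file.
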