[pull] main from abetlen:main #64

Merged 3 commits on May 8, 2025
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

## [0.3.9]

- feat: Update llama.cpp to ggerganov/llama.cpp@8733e0cf6eefc7c7752297cc22d0836706f4222c

## [0.3.8]

- feat: Update llama.cpp to ggerganov/llama.cpp@7841fc723e059d1fd9640e5c0ef19050fcc7c698
5 changes: 4 additions & 1 deletion CMakeLists.txt
@@ -62,6 +62,9 @@ if (LLAMA_BUILD)
# Enable building of the common library
set(LLAMA_BUILD_COMMON ON CACHE BOOL "Build llama.cpp common library" FORCE)

# Disable building curl support
set(LLAMA_CURL OFF CACHE BOOL "llama.cpp: enable curl" FORCE)

# Architecture detection and settings for Apple platforms
if (APPLE)
# Get the target architecture
@@ -143,7 +146,7 @@ if (LLAMA_BUILD)
endif()

# Building llava
add_subdirectory(vendor/llama.cpp/examples/llava)
add_subdirectory(vendor/llama.cpp/tools/mtmd)
set_target_properties(llava_shared PROPERTIES OUTPUT_NAME "llava")

if (WIN32)
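For context (not part of the diff): the Python bindings do not rely on llama.cpp's curl support, so with LLAMA_CURL forced off a GGUF file can still be fetched on the Python side, for example with the high-level Llama.from_pretrained helper, which uses the optional huggingface_hub dependency. A minimal sketch with a hypothetical example repository:

```python
# Sketch (assumes huggingface_hub is installed): download a GGUF and load it,
# without any curl support in the native llama.cpp build.
from llama_cpp import Llama

llm = Llama.from_pretrained(
    repo_id="Qwen/Qwen2-0.5B-Instruct-GGUF",  # hypothetical example repository
    filename="*q8_0.gguf",                    # glob matching the quantization to fetch
    verbose=False,
)
```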
2 changes: 1 addition & 1 deletion llama_cpp/__init__.py
@@ -1,4 +1,4 @@
from .llama_cpp import *
from .llama import *

__version__ = "0.3.8"
__version__ = "0.3.9"
33 changes: 21 additions & 12 deletions llama_cpp/llama_cpp.py
@@ -235,6 +235,8 @@
# LLAMA_VOCAB_PRE_TYPE_SUPERBPE = 30,
# LLAMA_VOCAB_PRE_TYPE_TRILLION = 31,
# LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32,
# LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33,
# LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34,
# };
LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0
LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1
@@ -252,7 +254,7 @@
LLAMA_VOCAB_PRE_TYPE_DBRX = 13
LLAMA_VOCAB_PRE_TYPE_SMAUG = 14
LLAMA_VOCAB_PRE_TYPE_PORO = 15
LLAMA_VOCAV_PRE_TYPE_CHATGLM3 = 16
LLAMA_VOCAB_PRE_TYPE_CHATGLM3 = 16
LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17
LLAMA_VOCAB_PRE_TYPE_VIKING = 18
LLAMA_VOCAB_PRE_TYPE_JAIS = 19
@@ -269,6 +271,8 @@
LLAMA_VOCAB_PRE_TYPE_SUPERBPE = 30
LLAMA_VOCAB_PRE_TYPE_TRILLION = 31
LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32
LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33
LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34


# // note: these values should be synchronized with ggml_rope
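The two constants added above (LLAMA_VOCAB_PRE_TYPE_LLAMA4 and LLAMA_VOCAB_PRE_TYPE_PIXTRAL) are plain module-level integers mirroring the C enum. A small sketch, assuming llama-cpp-python is installed so the low-level module and its shared library load:

```python
# Enumerate the pre-tokenizer type constants defined in llama_cpp/llama_cpp.py.
from llama_cpp import llama_cpp as lcpp

pre_types = {
    name: value
    for name, value in vars(lcpp).items()
    if name.startswith("LLAMA_VOCAB_PRE_TYPE_")
}
print(pre_types["LLAMA_VOCAB_PRE_TYPE_LLAMA4"])   # 33
print(pre_types["LLAMA_VOCAB_PRE_TYPE_PIXTRAL"])  # 34
```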
@@ -891,17 +895,18 @@ class llama_context_params(ctypes.Structure):

# // model quantization parameters
# typedef struct llama_model_quantize_params {
# int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
# enum llama_ftype ftype; // quantize to this llama_ftype
# enum ggml_type output_tensor_type; // output tensor type
# enum ggml_type token_embedding_type; // token embeddings tensor type
# bool allow_requantize; // allow quantizing non-f32/f16 tensors
# bool quantize_output_tensor; // quantize output.weight
# bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
# bool pure; // quantize all tensors to the default type
# bool keep_split; // quantize to the same number of shards
# void * imatrix; // pointer to importance matrix data
# void * kv_overrides; // pointer to vector containing overrides
# int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
# enum llama_ftype ftype; // quantize to this llama_ftype
# enum ggml_type output_tensor_type; // output tensor type
# enum ggml_type token_embedding_type; // token embeddings tensor type
# bool allow_requantize; // allow quantizing non-f32/f16 tensors
# bool quantize_output_tensor; // quantize output.weight
# bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
# bool pure; // quantize all tensors to the default type
# bool keep_split; // quantize to the same number of shards
# void * imatrix; // pointer to importance matrix data
# void * kv_overrides; // pointer to vector containing overrides
# void * tensor_types; // pointer to vector containing tensor types
# } llama_model_quantize_params;
class llama_model_quantize_params(ctypes.Structure):
"""Parameters for llama_model_quantize
@@ -918,6 +923,7 @@ class llama_model_quantize_params(ctypes.Structure):
keep_split (bool): quantize to the same number of shards
imatrix (ctypes.c_void_p): pointer to importance matrix data
kv_overrides (ctypes.c_void_p): pointer to vector containing overrides
tensor_types (ctypes.c_void_p): pointer to vector containing tensor types
"""

if TYPE_CHECKING:
@@ -932,6 +938,7 @@ class llama_model_quantize_params(ctypes.Structure):
keep_split: bool
imatrix: ctypes.c_void_p
kv_overrides: ctypes.c_void_p
tensor_types: ctypes.c_void_p

_fields_ = [
("nthread", ctypes.c_int32),
@@ -945,6 +952,7 @@
("keep_split", ctypes.c_bool),
("imatrix", ctypes.c_void_p),
("kv_overrides", ctypes.c_void_p),
("tensor_types", ctypes.c_void_p),
]
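A minimal construction sketch (not part of the diff) showing how the extended struct can be filled from Python; field names follow the _fields_ list above, and LLAMA_FTYPE_MOSTLY_Q4_K_M is assumed to be one of the ftype constants exported by these bindings:

```python
# Populate llama_model_quantize_params, leaving the new tensor_types pointer NULL.
from llama_cpp import llama_cpp as lcpp

params = lcpp.llama_model_quantize_params()
params.nthread = 0                      # <= 0: llama.cpp uses hardware_concurrency()
params.ftype = lcpp.LLAMA_FTYPE_MOSTLY_Q4_K_M
params.quantize_output_tensor = True
params.imatrix = None                   # no importance matrix supplied
params.tensor_types = None              # new field: NULL unless per-tensor types are given
```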


@@ -3812,6 +3820,7 @@ def llama_sampler_init_softmax() -> llama_sampler_p:


# /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
# /// Setting k <= 0 makes this a noop
# LLAMA_API struct llama_sampler * llama_sampler_init_top_k (int32_t k);
@ctypes_function("llama_sampler_init_top_k", [ctypes.c_int32], llama_sampler_p_ctypes)
def llama_sampler_init_top_k(k: int) -> llama_sampler_p:
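A short usage sketch for the binding above (not part of the diff); it assumes the package is installed so the shared library loads, and uses llama_sampler_free from the same sampler API to release the samplers:

```python
# The documented behaviour: k <= 0 turns the top-k sampler into a no-op.
from llama_cpp import llama_cpp as lcpp

top_k = lcpp.llama_sampler_init_top_k(40)  # keep only the 40 most probable tokens
noop = lcpp.llama_sampler_init_top_k(0)    # k <= 0: candidates pass through unchanged

lcpp.llama_sampler_free(top_k)
lcpp.llama_sampler_free(noop)
```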
2 changes: 1 addition & 1 deletion vendor/llama.cpp