diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8f402fa2..96c62950 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -69,7 +69,7 @@ endif()
# include jni.h and jni_md.h
if(NOT DEFINED JNI_INCLUDE_DIRS)
- if(OS_NAME MATCHES "^Linux" OR OS_NAME STREQUAL "Mac")
+ if(OS_NAME MATCHES "^Linux" OR OS_NAME STREQUAL "Mac" OR OS_NAME STREQUAL "Darwin")
set(JNI_INCLUDE_DIRS .github/include/unix)
elseif(OS_NAME STREQUAL "Windows")
set(JNI_INCLUDE_DIRS .github/include/windows)
diff --git a/pom.xml b/pom.xml
index 3916a9e7..67b366ee 100644
--- a/pom.xml
+++ b/pom.xml
@@ -5,7 +5,7 @@
<groupId>de.kherud</groupId>
<artifactId>llama</artifactId>
- <version>4.1.0</version>
+ <version>4.2.0</version>
<packaging>jar</packaging>
<name>${project.groupId}:${project.artifactId}</name>
diff --git a/src/main/cpp/jllama.cpp b/src/main/cpp/jllama.cpp
index ac056b94..11c80ae0 100644
--- a/src/main/cpp/jllama.cpp
+++ b/src/main/cpp/jllama.cpp
@@ -452,16 +452,6 @@ JNIEXPORT void JNICALL Java_de_kherud_llama_LlamaModel_loadModel(JNIEnv *env, jo
llama_init_dft.context.reset();
}
- ctx_server->chat_templates = common_chat_templates_init(ctx_server->model, params.chat_template);
- try {
- common_chat_format_example(ctx_server->chat_templates.get(), params.use_jinja);
- } catch (const std::exception &e) {
- SRV_WRN("%s: The chat template that comes with this model is not yet supported, falling back to chatml. This "
- "may cause the model to output suboptimal responses\n",
- __func__);
- ctx_server->chat_templates = common_chat_templates_init(ctx_server->model, "chatml");
- }
-
// print sample chat example to make it clear which template is used
LOG_INF("%s: chat template, chat_template: %s, example_format: '%s'\n", __func__,
common_chat_templates_source(ctx_server->chat_templates.get()),
@@ -860,4 +850,4 @@ JNIEXPORT jbyteArray JNICALL Java_de_kherud_llama_LlamaModel_jsonSchemaToGrammar
nlohmann::ordered_json c_schema_json = nlohmann::ordered_json::parse(c_schema);
const std::string c_grammar = json_schema_to_grammar(c_schema_json);
return parse_jbytes(env, c_grammar);
-}
\ No newline at end of file
+}
diff --git a/src/main/cpp/server.hpp b/src/main/cpp/server.hpp
index 66169a83..9686f2af 100644
--- a/src/main/cpp/server.hpp
+++ b/src/main/cpp/server.hpp
@@ -3269,151 +3269,3 @@ struct server_context {
};
}
};
-
-static void common_params_handle_model_default(std::string &model, const std::string &model_url, std::string &hf_repo,
- std::string &hf_file, const std::string &hf_token) {
- if (!hf_repo.empty()) {
- // short-hand to avoid specifying --hf-file -> default it to --model
- if (hf_file.empty()) {
- if (model.empty()) {
- auto auto_detected = common_get_hf_file(hf_repo, hf_token);
- if (auto_detected.first.empty() || auto_detected.second.empty()) {
- exit(1); // built without CURL, error message already printed
- }
- hf_repo = auto_detected.first;
- hf_file = auto_detected.second;
- } else {
- hf_file = model;
- }
- }
- // make sure model path is present (for caching purposes)
- if (model.empty()) {
- // this is to avoid different repo having same file name, or same file name in different subdirs
- std::string filename = hf_repo + "_" + hf_file;
- // to make sure we don't have any slashes in the filename
- string_replace_all(filename, "/", "_");
- model = fs_get_cache_file(filename);
- }
- } else if (!model_url.empty()) {
- if (model.empty()) {
- auto f = string_split<std::string>(model_url, '#').front();
- f = string_split<std::string>(f, '?').front();
- model = fs_get_cache_file(string_split<std::string>(f, '/').back());
- }
- } else if (model.empty()) {
- model = DEFAULT_MODEL_PATH;
- }
-}
-
-// parse the given jparams (see de.kherud.llama.args.ModelParameters#toString()) from JSON to the required C++ struct.
- static void server_params_parse(json jparams, common_params &params) {
- common_params default_params;
-
- params.sampling.seed = json_value(jparams, "seed", default_params.sampling.seed);
- params.cpuparams.n_threads = json_value(jparams, "n_threads", default_params.cpuparams.n_threads);
- params.speculative.cpuparams.n_threads =
- json_value(jparams, "n_threads_draft", default_params.speculative.cpuparams.n_threads);
- params.cpuparams_batch.n_threads = json_value(jparams, "n_threads_batch", default_params.cpuparams_batch.n_threads);
- params.speculative.cpuparams_batch.n_threads =
- json_value(jparams, "n_threads_batch_draft", default_params.speculative.cpuparams_batch.n_threads);
- params.n_predict = json_value(jparams, "n_predict", default_params.n_predict);
- params.n_ctx = json_value(jparams, "n_ctx", default_params.n_ctx);
- params.n_batch = json_value(jparams, "n_batch", default_params.n_batch);
- params.n_ubatch = json_value(jparams, "n_ubatch", default_params.n_ubatch);
- params.n_keep = json_value(jparams, "n_keep", default_params.n_keep);
-
- params.speculative.n_max = json_value(jparams, "n_draft", default_params.speculative.n_max);
- params.speculative.n_min = json_value(jparams, "n_draft_min", default_params.speculative.n_min);
-
- params.n_chunks = json_value(jparams, "n_chunks", default_params.n_chunks);
- params.n_parallel = json_value(jparams, "n_parallel", default_params.n_parallel);
- params.n_sequences = json_value(jparams, "n_sequences", default_params.n_sequences);
- params.speculative.p_split = json_value(jparams, "p_split", default_params.speculative.p_split);
- params.grp_attn_n = json_value(jparams, "grp_attn_n", default_params.grp_attn_n);
- params.grp_attn_w = json_value(jparams, "grp_attn_w", default_params.grp_attn_w);
- params.n_print = json_value(jparams, "n_print", default_params.n_print);
- params.rope_freq_base = json_value(jparams, "rope_freq_base", default_params.rope_freq_base);
- params.rope_freq_scale = json_value(jparams, "rope_freq_scale", default_params.rope_freq_scale);
- params.yarn_ext_factor = json_value(jparams, "yarn_ext_factor", default_params.yarn_ext_factor);
- params.yarn_attn_factor = json_value(jparams, "yarn_attn_factor", default_params.yarn_attn_factor);
- params.yarn_beta_fast = json_value(jparams, "yarn_beta_fast", default_params.yarn_beta_fast);
- params.yarn_beta_slow = json_value(jparams, "yarn_beta_slow", default_params.yarn_beta_slow);
- params.yarn_orig_ctx = json_value(jparams, "yarn_orig_ctx", default_params.yarn_orig_ctx);
- params.defrag_thold = json_value(jparams, "defrag_thold", default_params.defrag_thold);
- params.numa = json_value(jparams, "numa", default_params.numa);
- params.rope_scaling_type = json_value(jparams, "rope_scaling_type", default_params.rope_scaling_type);
- params.pooling_type = json_value(jparams, "pooling_type", default_params.pooling_type);
- params.model = json_value(jparams, "model", default_params.model);
- params.speculative.model = json_value(jparams, "model_draft", default_params.speculative.model);
- params.model_alias = json_value(jparams, "model_alias", default_params.model_alias);
- params.model_url = json_value(jparams, "model_url", default_params.model_url);
- params.hf_repo = json_value(jparams, "hf_repo", default_params.hf_repo);
- params.hf_file = json_value(jparams, "hf_file", default_params.hf_file);
- params.prompt = json_value(jparams, "prompt", default_params.prompt);
- params.prompt_file = json_value(jparams, "prompt_file", default_params.prompt_file);
- params.path_prompt_cache = json_value(jparams, "path_prompt_cache", default_params.path_prompt_cache);
- params.input_prefix = json_value(jparams, "input_prefix", default_params.input_prefix);
- params.input_suffix = json_value(jparams, "input_suffix", default_params.input_suffix);
- params.antiprompt = json_value(jparams, "antiprompt", default_params.antiprompt);
- params.lookup_cache_static = json_value(jparams, "lookup_cache_static", default_params.lookup_cache_static);
- params.lookup_cache_dynamic = json_value(jparams, "lookup_cache_dynamic", default_params.lookup_cache_dynamic);
- params.logits_file = json_value(jparams, "logits_file", default_params.logits_file);
- // params.lora_adapters = json_value(jparams, "lora_adapter", default_params.lora_adapters);
- params.embedding = json_value(jparams, "embedding", default_params.embedding);
- params.escape = json_value(jparams, "escape", default_params.escape);
- params.cont_batching = json_value(jparams, "cont_batching", default_params.cont_batching);
- params.flash_attn = json_value(jparams, "flash_attn", default_params.flash_attn);
- params.input_prefix_bos = json_value(jparams, "input_prefix_bos", default_params.input_prefix_bos);
- params.sampling.ignore_eos = json_value(jparams, "ignore_eos", default_params.sampling.ignore_eos);
- params.use_mmap = json_value(jparams, "use_mmap", default_params.use_mmap);
- params.use_mlock = json_value(jparams, "use_mlock", default_params.use_mlock);
- params.no_kv_offload = json_value(jparams, "no_kv_offload", default_params.no_kv_offload);
- params.chat_template = json_value(jparams, "chat_template", default_params.chat_template);
-
- if (jparams.contains("n_gpu_layers")) {
- if (llama_supports_gpu_offload()) {
- params.n_gpu_layers = json_value(jparams, "n_gpu_layers", default_params.n_gpu_layers);
- params.speculative.n_gpu_layers =
- json_value(jparams, "n_gpu_layers_draft", default_params.speculative.n_gpu_layers);
- } else {
- SRV_WRN("Not compiled with GPU offload support, --n-gpu-layers option will be ignored. "
- "See main README.md for information on enabling GPU BLAS support: %s = %d",
- "n_gpu_layers", params.n_gpu_layers);
- }
- }
-
- if (jparams.contains("split_mode")) {
- params.split_mode = json_value(jparams, "split_mode", default_params.split_mode);
-// todo: the definition checks here currently don't work due to cmake visibility reasons
-#ifndef GGML_USE_CUDA
- fprintf(stderr, "warning: llama.cpp was compiled without CUDA. Setting the split mode has no effect.\n");
-#endif
- }
-
- if (jparams.contains("tensor_split")) {
-#if defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)
- std::vector<float> tensor_split = jparams["tensor_split"].get<std::vector<float>>();
- GGML_ASSERT(tensor_split.size() <= llama_max_devices());
-
- for (size_t i_device = 0; i_device < llama_max_devices(); ++i_device) {
- if (i_device < tensor_split.size()) {
- params.tensor_split[i_device] = tensor_split.at(i_device);
- } else {
- params.tensor_split[i_device] = 0.0f;
- }
- }
-#else
- SRV_WRN("%s", "llama.cpp was compiled without CUDA. It is not possible to set a tensor split.\n");
-#endif // GGML_USE_CUDA
- }
-
- if (jparams.contains("main_gpu")) {
-#if defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)
- params.main_gpu = json_value(jparams, "main_gpu", default_params.main_gpu);
-#else
- SRV_WRN("%s", "llama.cpp was compiled without CUDA. It is not possible to set a main GPU.");
-#endif
- }
-
- common_params_handle_model_default(params.model, params.model_url, params.hf_repo, params.hf_file, params.hf_token);
-}
diff --git a/src/main/java/de/kherud/llama/ModelParameters.java b/src/main/java/de/kherud/llama/ModelParameters.java
index e4947d4e..7999295d 100644
--- a/src/main/java/de/kherud/llama/ModelParameters.java
+++ b/src/main/java/de/kherud/llama/ModelParameters.java
@@ -459,7 +459,7 @@ public ModelParameters setJsonSchema(String schema) {
* Set pooling type for embeddings (default: model default if unspecified).
*/
public ModelParameters setPoolingType(PoolingType type) {
- parameters.put("--pooling", String.valueOf(type.getId()));
+ parameters.put("--pooling", type.getArgValue());
return this;
}
@@ -467,7 +467,7 @@ public ModelParameters setPoolingType(PoolingType type) {
* Set RoPE frequency scaling method (default: linear unless specified by the model).
*/
public ModelParameters setRopeScaling(RopeScalingType type) {
- parameters.put("--rope-scaling", String.valueOf(type.getId()));
+ parameters.put("--rope-scaling", type.getArgValue());
return this;
}
@@ -960,3 +960,5 @@ public ModelParameters enableJinja() {
}
}
+
+
diff --git a/src/main/java/de/kherud/llama/OSInfo.java b/src/main/java/de/kherud/llama/OSInfo.java
index 772aeaef..9354ec2f 100644
--- a/src/main/java/de/kherud/llama/OSInfo.java
+++ b/src/main/java/de/kherud/llama/OSInfo.java
@@ -200,7 +200,7 @@ else if (armType.startsWith("aarch64")) {
}
// Java 1.8 introduces a system property to determine armel or armhf
- // http://bugs.java.com/bugdatabase/view_bug.do?bug_id=8005545
+ // https://bugs.openjdk.org/browse/JDK-8005545
String abi = System.getProperty("sun.arch.abi");
if (abi != null && abi.startsWith("gnueabihf")) {
return "armv7";
diff --git a/src/main/java/de/kherud/llama/args/PoolingType.java b/src/main/java/de/kherud/llama/args/PoolingType.java
index a9c9dbae..c0379c85 100644
--- a/src/main/java/de/kherud/llama/args/PoolingType.java
+++ b/src/main/java/de/kherud/llama/args/PoolingType.java
@@ -2,20 +2,20 @@
public enum PoolingType {
- UNSPECIFIED(-1),
- NONE(0),
- MEAN(1),
- CLS(2),
- LAST(3),
- RANK(4);
+ UNSPECIFIED("unspecified"),
+ NONE("none"),
+ MEAN("mean"),
+ CLS("cls"),
+ LAST("last"),
+ RANK("rank");
- private final int id;
+ private final String argValue;
- PoolingType(int value) {
- this.id = value;
+ PoolingType(String value) {
+ this.argValue = value;
}
- public int getId() {
- return id;
+ public String getArgValue() {
+ return argValue;
}
-}
+}
\ No newline at end of file
diff --git a/src/main/java/de/kherud/llama/args/RopeScalingType.java b/src/main/java/de/kherud/llama/args/RopeScalingType.java
index eed939a1..138d05be 100644
--- a/src/main/java/de/kherud/llama/args/RopeScalingType.java
+++ b/src/main/java/de/kherud/llama/args/RopeScalingType.java
@@ -2,20 +2,20 @@
public enum RopeScalingType {
- UNSPECIFIED(-1),
- NONE(0),
- LINEAR(1),
- YARN2(2),
- LONGROPE(3),
- MAX_VALUE(3);
+ UNSPECIFIED("unspecified"),
+ NONE("none"),
+ LINEAR("linear"),
+ YARN2("yarn"),
+ LONGROPE("longrope"),
+ MAX_VALUE("maxvalue");
- private final int id;
+ private final String argValue;
- RopeScalingType(int value) {
- this.id = value;
+ RopeScalingType(String value) {
+ this.argValue = value;
}
- public int getId() {
- return id;
+ public String getArgValue() {
+ return argValue;
}
-}
+}
\ No newline at end of file
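
Note on the PoolingType/RopeScalingType change: both enums now carry the llama.cpp CLI string values ("none", "mean", "yarn", ...) instead of numeric ids, and ModelParameters forwards them verbatim as the "--pooling" and "--rope-scaling" argument values. A minimal usage sketch, assuming the usual builder-style ModelParameters API; only setPoolingType and setRopeScaling are confirmed by this diff, the surrounding setup is an assumption:

import de.kherud.llama.ModelParameters;
import de.kherud.llama.args.PoolingType;
import de.kherud.llama.args.RopeScalingType;

class PoolingArgsSketch {
    public static void main(String[] args) {
        // Sketch only: the two setter calls below are the ones touched by this patch;
        // everything else here is illustrative and not part of the change.
        ModelParameters params = new ModelParameters()
                .setPoolingType(PoolingType.MEAN)        // now emits "--pooling mean" instead of "--pooling 1"
                .setRopeScaling(RopeScalingType.YARN2);  // now emits "--rope-scaling yarn" instead of "--rope-scaling 2"
    }
}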