diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8f402fa2..96c62950 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -69,7 +69,7 @@ endif()
 
 # include jni.h and jni_md.h
 if(NOT DEFINED JNI_INCLUDE_DIRS)
-    if(OS_NAME MATCHES "^Linux" OR OS_NAME STREQUAL "Mac")
+    if(OS_NAME MATCHES "^Linux" OR OS_NAME STREQUAL "Mac" OR OS_NAME STREQUAL "Darwin")
         set(JNI_INCLUDE_DIRS .github/include/unix)
     elseif(OS_NAME STREQUAL "Windows")
         set(JNI_INCLUDE_DIRS .github/include/windows)
diff --git a/pom.xml b/pom.xml
index 3916a9e7..67b366ee 100644
--- a/pom.xml
+++ b/pom.xml
@@ -5,7 +5,7 @@
 
     <groupId>de.kherud</groupId>
    <artifactId>llama</artifactId>
-    <version>4.1.0</version>
+    <version>4.2.0</version>
     <packaging>jar</packaging>
 
     <name>${project.groupId}:${project.artifactId}</name>
diff --git a/src/main/cpp/jllama.cpp b/src/main/cpp/jllama.cpp
index ac056b94..11c80ae0 100644
--- a/src/main/cpp/jllama.cpp
+++ b/src/main/cpp/jllama.cpp
@@ -452,16 +452,6 @@ JNIEXPORT void JNICALL Java_de_kherud_llama_LlamaModel_loadModel(JNIEnv *env, jo
         llama_init_dft.context.reset();
     }
 
-    ctx_server->chat_templates = common_chat_templates_init(ctx_server->model, params.chat_template);
-    try {
-        common_chat_format_example(ctx_server->chat_templates.get(), params.use_jinja);
-    } catch (const std::exception &e) {
-        SRV_WRN("%s: The chat template that comes with this model is not yet supported, falling back to chatml. This "
-                "may cause the model to output suboptimal responses\n",
-                __func__);
-        ctx_server->chat_templates = common_chat_templates_init(ctx_server->model, "chatml");
-    }
-
     // print sample chat example to make it clear which template is used
     LOG_INF("%s: chat template, chat_template: %s, example_format: '%s'\n", __func__,
             common_chat_templates_source(ctx_server->chat_templates.get()),
@@ -860,4 +850,4 @@ JNIEXPORT jbyteArray JNICALL Java_de_kherud_llama_LlamaModel_jsonSchemaToGrammar
     nlohmann::ordered_json c_schema_json = nlohmann::ordered_json::parse(c_schema);
     const std::string c_grammar = json_schema_to_grammar(c_schema_json);
     return parse_jbytes(env, c_grammar);
-}
\ No newline at end of file
+}
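Note: this removes jllama.cpp's explicit chatml fallback for models whose bundled chat template fails to parse; the surviving LOG_INF still reports which template is in effect, so template initialization presumably happens earlier in the shared llama.cpp setup now. A minimal Java-side sketch of steering templating after this change, assuming the 4.x API surface; only enableJinja() is confirmed by this diff, the model path and the other calls are illustrative:

    // Sketch only: relies on the model's built-in chat template, since the
    // native chatml fallback removed above no longer exists.
    ModelParameters params = new ModelParameters()
            .setModel("models/model.gguf") // hypothetical path; setter name assumed
            .enableJinja();                // confirmed by this PR
    try (LlamaModel model = new LlamaModel(params)) { // constructor/AutoCloseable assumed
        // an unsupported template now surfaces as an error instead of being
        // silently replaced with chatml
    }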
diff --git a/src/main/cpp/server.hpp b/src/main/cpp/server.hpp
index 66169a83..9686f2af 100644
--- a/src/main/cpp/server.hpp
+++ b/src/main/cpp/server.hpp
@@ -3269,151 +3269,3 @@ struct server_context {
         };
     }
 };
-
-static void common_params_handle_model_default(std::string &model, const std::string &model_url, std::string &hf_repo,
-                                               std::string &hf_file, const std::string &hf_token) {
-    if (!hf_repo.empty()) {
-        // short-hand to avoid specifying --hf-file -> default it to --model
-        if (hf_file.empty()) {
-            if (model.empty()) {
-                auto auto_detected = common_get_hf_file(hf_repo, hf_token);
-                if (auto_detected.first.empty() || auto_detected.second.empty()) {
-                    exit(1); // built without CURL, error message already printed
-                }
-                hf_repo = auto_detected.first;
-                hf_file = auto_detected.second;
-            } else {
-                hf_file = model;
-            }
-        }
-        // make sure model path is present (for caching purposes)
-        if (model.empty()) {
-            // this is to avoid different repo having same file name, or same file name in different subdirs
-            std::string filename = hf_repo + "_" + hf_file;
-            // to make sure we don't have any slashes in the filename
-            string_replace_all(filename, "/", "_");
-            model = fs_get_cache_file(filename);
-        }
-    } else if (!model_url.empty()) {
-        if (model.empty()) {
-            auto f = string_split<std::string>(model_url, '#').front();
-            f = string_split<std::string>(f, '?').front();
-            model = fs_get_cache_file(string_split<std::string>(f, '/').back());
-        }
-    } else if (model.empty()) {
-        model = DEFAULT_MODEL_PATH;
-    }
-}
-
-// parse the given jparams (see de.kherud.llama.args.ModelParameters#toString()) from JSON to the required C++ struct.
-static void server_params_parse(json jparams, common_params &params) {
-    common_params default_params;
-
-    params.sampling.seed = json_value(jparams, "seed", default_params.sampling.seed);
-    params.cpuparams.n_threads = json_value(jparams, "n_threads", default_params.cpuparams.n_threads);
-    params.speculative.cpuparams.n_threads =
-        json_value(jparams, "n_threads_draft", default_params.speculative.cpuparams.n_threads);
-    params.cpuparams_batch.n_threads = json_value(jparams, "n_threads_batch", default_params.cpuparams_batch.n_threads);
-    params.speculative.cpuparams_batch.n_threads =
-        json_value(jparams, "n_threads_batch_draft", default_params.speculative.cpuparams_batch.n_threads);
-    params.n_predict = json_value(jparams, "n_predict", default_params.n_predict);
-    params.n_ctx = json_value(jparams, "n_ctx", default_params.n_ctx);
-    params.n_batch = json_value(jparams, "n_batch", default_params.n_batch);
-    params.n_ubatch = json_value(jparams, "n_ubatch", default_params.n_ubatch);
-    params.n_keep = json_value(jparams, "n_keep", default_params.n_keep);
-
-    params.speculative.n_max = json_value(jparams, "n_draft", default_params.speculative.n_max);
-    params.speculative.n_min = json_value(jparams, "n_draft_min", default_params.speculative.n_min);
-
-    params.n_chunks = json_value(jparams, "n_chunks", default_params.n_chunks);
-    params.n_parallel = json_value(jparams, "n_parallel", default_params.n_parallel);
-    params.n_sequences = json_value(jparams, "n_sequences", default_params.n_sequences);
-    params.speculative.p_split = json_value(jparams, "p_split", default_params.speculative.p_split);
-    params.grp_attn_n = json_value(jparams, "grp_attn_n", default_params.grp_attn_n);
-    params.grp_attn_w = json_value(jparams, "grp_attn_w", default_params.grp_attn_w);
-    params.n_print = json_value(jparams, "n_print", default_params.n_print);
-    params.rope_freq_base = json_value(jparams, "rope_freq_base", default_params.rope_freq_base);
-    params.rope_freq_scale = json_value(jparams, "rope_freq_scale", default_params.rope_freq_scale);
-    params.yarn_ext_factor = json_value(jparams, "yarn_ext_factor", default_params.yarn_ext_factor);
-    params.yarn_attn_factor = json_value(jparams, "yarn_attn_factor", default_params.yarn_attn_factor);
-    params.yarn_beta_fast = json_value(jparams, "yarn_beta_fast", default_params.yarn_beta_fast);
-    params.yarn_beta_slow = json_value(jparams, "yarn_beta_slow", default_params.yarn_beta_slow);
-    params.yarn_orig_ctx = json_value(jparams, "yarn_orig_ctx", default_params.yarn_orig_ctx);
-    params.defrag_thold = json_value(jparams, "defrag_thold", default_params.defrag_thold);
-    params.numa = json_value(jparams, "numa", default_params.numa);
-    params.rope_scaling_type = json_value(jparams, "rope_scaling_type", default_params.rope_scaling_type);
-    params.pooling_type = json_value(jparams, "pooling_type", default_params.pooling_type);
-    params.model = json_value(jparams, "model", default_params.model);
-    params.speculative.model = json_value(jparams, "model_draft", default_params.speculative.model);
-    params.model_alias = json_value(jparams, "model_alias", default_params.model_alias);
-    params.model_url = json_value(jparams, "model_url", default_params.model_url);
-    params.hf_repo = json_value(jparams, "hf_repo", default_params.hf_repo);
-    params.hf_file = json_value(jparams, "hf_file", default_params.hf_file);
-    params.prompt = json_value(jparams, "prompt", default_params.prompt);
-    params.prompt_file = json_value(jparams, "prompt_file", default_params.prompt_file);
-    params.path_prompt_cache = json_value(jparams, "path_prompt_cache", default_params.path_prompt_cache);
-    params.input_prefix = json_value(jparams, "input_prefix", default_params.input_prefix);
-    params.input_suffix = json_value(jparams, "input_suffix", default_params.input_suffix);
-    params.antiprompt = json_value(jparams, "antiprompt", default_params.antiprompt);
-    params.lookup_cache_static = json_value(jparams, "lookup_cache_static", default_params.lookup_cache_static);
-    params.lookup_cache_dynamic = json_value(jparams, "lookup_cache_dynamic", default_params.lookup_cache_dynamic);
-    params.logits_file = json_value(jparams, "logits_file", default_params.logits_file);
-    // params.lora_adapters = json_value(jparams, "lora_adapter", default_params.lora_adapters);
-    params.embedding = json_value(jparams, "embedding", default_params.embedding);
-    params.escape = json_value(jparams, "escape", default_params.escape);
-    params.cont_batching = json_value(jparams, "cont_batching", default_params.cont_batching);
-    params.flash_attn = json_value(jparams, "flash_attn", default_params.flash_attn);
-    params.input_prefix_bos = json_value(jparams, "input_prefix_bos", default_params.input_prefix_bos);
-    params.sampling.ignore_eos = json_value(jparams, "ignore_eos", default_params.sampling.ignore_eos);
-    params.use_mmap = json_value(jparams, "use_mmap", default_params.use_mmap);
-    params.use_mlock = json_value(jparams, "use_mlock", default_params.use_mlock);
-    params.no_kv_offload = json_value(jparams, "no_kv_offload", default_params.no_kv_offload);
-    params.chat_template = json_value(jparams, "chat_template", default_params.chat_template);
-
-    if (jparams.contains("n_gpu_layers")) {
-        if (llama_supports_gpu_offload()) {
-            params.n_gpu_layers = json_value(jparams, "n_gpu_layers", default_params.n_gpu_layers);
-            params.speculative.n_gpu_layers =
-                json_value(jparams, "n_gpu_layers_draft", default_params.speculative.n_gpu_layers);
-        } else {
-            SRV_WRN("Not compiled with GPU offload support, --n-gpu-layers option will be ignored. "
-                    "See main README.md for information on enabling GPU BLAS support: %s = %d",
-                    "n_gpu_layers", params.n_gpu_layers);
-        }
-    }
-
-    if (jparams.contains("split_mode")) {
-        params.split_mode = json_value(jparams, "split_mode", default_params.split_mode);
-// todo: the definition checks here currently don't work due to cmake visibility reasons
-#ifndef GGML_USE_CUDA
-        fprintf(stderr, "warning: llama.cpp was compiled without CUDA. Setting the split mode has no effect.\n");
-#endif
-    }
-
-    if (jparams.contains("tensor_split")) {
-#if defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)
-        std::vector<float> tensor_split = jparams["tensor_split"].get<std::vector<float>>();
-        GGML_ASSERT(tensor_split.size() <= llama_max_devices());
-
-        for (size_t i_device = 0; i_device < llama_max_devices(); ++i_device) {
-            if (i_device < tensor_split.size()) {
-                params.tensor_split[i_device] = tensor_split.at(i_device);
-            } else {
-                params.tensor_split[i_device] = 0.0f;
-            }
-        }
-#else
-        SRV_WRN("%s", "llama.cpp was compiled without CUDA. It is not possible to set a tensor split.\n");
-#endif // GGML_USE_CUDA
-    }
-
-    if (jparams.contains("main_gpu")) {
-#if defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)
-        params.main_gpu = json_value(jparams, "main_gpu", default_params.main_gpu);
-#else
-        SRV_WRN("%s", "llama.cpp was compiled without CUDA. It is not possible to set a main GPU.");
-#endif
-    }
-
-    common_params_handle_model_default(params.model, params.model_url, params.hf_repo, params.hf_file, params.hf_token);
-}
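For reference, the deleted common_params_handle_model_default derived a local cache file name whenever the model came from --hf-repo/--hf-file or a URL rather than a local path. The same derivation as a small Java sketch (the helper name is hypothetical; the logic mirrors the removed C++):

    // Hypothetical helper mirroring the deleted C++ logic: repo and file are
    // joined so that equal file names from different repos or subdirectories
    // cannot collide, then slashes are flattened out of the cache file name.
    static String cacheFileName(String hfRepo, String hfFile) {
        String filename = hfRepo + "_" + hfFile; // e.g. "org/repo" + "_" + "model.gguf"
        return filename.replace("/", "_");       // -> "org_repo_model.gguf"
    }

With server_params_parse gone as well, the hand-written JSON-to-common_params bridge no longer exists; judging by the Java changes below, options now travel as CLI-style flag/value strings instead.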
diff --git a/src/main/java/de/kherud/llama/ModelParameters.java b/src/main/java/de/kherud/llama/ModelParameters.java
index e4947d4e..7999295d 100644
--- a/src/main/java/de/kherud/llama/ModelParameters.java
+++ b/src/main/java/de/kherud/llama/ModelParameters.java
@@ -459,7 +459,7 @@ public ModelParameters setJsonSchema(String schema) {
      * Set pooling type for embeddings (default: model default if unspecified).
      */
     public ModelParameters setPoolingType(PoolingType type) {
-        parameters.put("--pooling", String.valueOf(type.getId()));
+        parameters.put("--pooling", type.getArgValue());
         return this;
     }
 
@@ -467,7 +467,7 @@ public ModelParameters setPoolingType(PoolingType type) {
      * Set RoPE frequency scaling method (default: linear unless specified by the model).
      */
     public ModelParameters setRopeScaling(RopeScalingType type) {
-        parameters.put("--rope-scaling", String.valueOf(type.getId()));
+        parameters.put("--rope-scaling", type.getArgValue());
         return this;
     }
 
@@ -960,3 +960,5 @@ public ModelParameters enableJinja() {
     }
 
 }
+
+
diff --git a/src/main/java/de/kherud/llama/OSInfo.java b/src/main/java/de/kherud/llama/OSInfo.java
index 772aeaef..9354ec2f 100644
--- a/src/main/java/de/kherud/llama/OSInfo.java
+++ b/src/main/java/de/kherud/llama/OSInfo.java
@@ -200,7 +200,7 @@ else if (armType.startsWith("aarch64")) {
                 }
 
                 // Java 1.8 introduces a system property to determine armel or armhf
-                // http://bugs.java.com/bugdatabase/view_bug.do?bug_id=8005545
+                // https://bugs.openjdk.org/browse/JDK-8005545
                 String abi = System.getProperty("sun.arch.abi");
                 if (abi != null && abi.startsWith("gnueabihf")) {
                     return "armv7";
diff --git a/src/main/java/de/kherud/llama/args/PoolingType.java b/src/main/java/de/kherud/llama/args/PoolingType.java
index a9c9dbae..c0379c85 100644
--- a/src/main/java/de/kherud/llama/args/PoolingType.java
+++ b/src/main/java/de/kherud/llama/args/PoolingType.java
@@ -2,20 +2,20 @@
 
 public enum PoolingType {
 
-    UNSPECIFIED(-1),
-    NONE(0),
-    MEAN(1),
-    CLS(2),
-    LAST(3),
-    RANK(4);
+    UNSPECIFIED("unspecified"),
+    NONE("none"),
+    MEAN("mean"),
+    CLS("cls"),
+    LAST("last"),
+    RANK("rank");
 
-    private final int id;
+    private final String argValue;
 
-    PoolingType(int value) {
-        this.id = value;
+    PoolingType(String value) {
+        this.argValue = value;
     }
 
-    public int getId() {
-        return id;
+    public String getArgValue() {
+        return argValue;
     }
-}
+}
\ No newline at end of file
diff --git a/src/main/java/de/kherud/llama/args/RopeScalingType.java b/src/main/java/de/kherud/llama/args/RopeScalingType.java
index eed939a1..138d05be 100644
--- a/src/main/java/de/kherud/llama/args/RopeScalingType.java
+++ b/src/main/java/de/kherud/llama/args/RopeScalingType.java
@@ -2,20 +2,20 @@
 
 public enum RopeScalingType {
 
-    UNSPECIFIED(-1),
-    NONE(0),
-    LINEAR(1),
-    YARN2(2),
-    LONGROPE(3),
-    MAX_VALUE(3);
+    UNSPECIFIED("unspecified"),
+    NONE("none"),
+    LINEAR("linear"),
+    YARN2("yarn"),
+    LONGROPE("longrope"),
+    MAX_VALUE("maxvalue");
 
-    private final int id;
+    private final String argValue;
 
-    RopeScalingType(int value) {
-        this.id = value;
+    RopeScalingType(String value) {
+        this.argValue = value;
     }
 
-    public int getId() {
-        return id;
+    public String getArgValue() {
+        return argValue;
     }
-}
+}
\ No newline at end of file
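Usage-wise, the two enum reworks change what setPoolingType and setRopeScaling write into the parameter map: a named argument value instead of a stringified integer id. A short sketch (the two setters and flag names are confirmed by this diff, everything else is illustrative):

    // Previously serialized as "--pooling 1" and "--rope-scaling 2";
    // getArgValue() now yields "mean" and "yarn" instead.
    ModelParameters params = new ModelParameters()
            .setPoolingType(PoolingType.MEAN)       // --pooling mean
            .setRopeScaling(RopeScalingType.YARN2); // --rope-scaling yarn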