From 95559fa3d208f474bdc2055c34826f2af449c9c9 Mon Sep 17 00:00:00 2001 From: zdl010 Date: Sat, 29 Jun 2024 17:19:43 +0800 Subject: [PATCH 01/24] Upgrade llama.cpp to b3265, support gemma2, remove beam parameter[ https://github.com/ggerganov/llama.cpp/pull/7985 ] --- CMakeLists.txt | 2 +- pom.xml | 2 +- src/main/cpp/server.hpp | 1 - src/main/java/de/kherud/llama/ModelParameters.java | 9 --------- 4 files changed, 2 insertions(+), 12 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 550759f2..a7c2c4e1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -24,7 +24,7 @@ FetchContent_MakeAvailable(json) FetchContent_Declare( llama.cpp GIT_REPOSITORY https://github.com/ggerganov/llama.cpp.git - GIT_TAG b3008 + GIT_TAG b3265 ) FetchContent_MakeAvailable(llama.cpp) diff --git a/pom.xml b/pom.xml index 79f10350..95bf822b 100644 --- a/pom.xml +++ b/pom.xml @@ -4,7 +4,7 @@ de.kherud llama - 3.2.1 + 3.2.2 jar ${project.groupId}:${project.artifactId} diff --git a/src/main/cpp/server.hpp b/src/main/cpp/server.hpp index d3d4750a..5b9064de 100644 --- a/src/main/cpp/server.hpp +++ b/src/main/cpp/server.hpp @@ -2551,7 +2551,6 @@ static void server_params_parse(json jparams, server_params &sparams, gpt_params params.n_parallel = json_value(jparams, "n_parallel", default_params.n_parallel); params.n_sequences = json_value(jparams, "n_sequences", default_params.n_sequences); params.p_split = json_value(jparams, "p_split", default_params.p_split); - params.n_beams = json_value(jparams, "n_beams", default_params.n_beams); params.grp_attn_n = json_value(jparams, "grp_attn_n", default_params.grp_attn_n); params.grp_attn_w = json_value(jparams, "grp_attn_w", default_params.grp_attn_w); params.n_print = json_value(jparams, "n_print", default_params.n_print); diff --git a/src/main/java/de/kherud/llama/ModelParameters.java b/src/main/java/de/kherud/llama/ModelParameters.java index 1cbb6973..98342d37 100644 --- a/src/main/java/de/kherud/llama/ModelParameters.java +++ b/src/main/java/de/kherud/llama/ModelParameters.java @@ -32,7 +32,6 @@ public final class ModelParameters extends JsonParameters { private static final String PARAM_SPLIT_MODE = "split_mode"; private static final String PARAM_MAIN_GPU = "main_gpu"; private static final String PARAM_TENSOR_SPLIT = "tensor_split"; - private static final String PARAM_N_BEAMS = "n_beams"; private static final String PARAM_GRP_ATTN_N = "grp_attn_n"; private static final String PARAM_GRP_ATTN_W = "grp_attn_w"; private static final String PARAM_ROPE_FREQ_BASE = "rope_freq_base"; @@ -244,14 +243,6 @@ public ModelParameters setTensorSplit(float[] tensorSplit) { return this; } - /** - * Set usage of beam search of given width if non-zero. 
- */ - public ModelParameters setNBeams(int nBeams) { - parameters.put(PARAM_N_BEAMS, String.valueOf(nBeams)); - return this; - } - /** * Set the group-attention factor (default: 1) */ From bb3570d5e8787eca6d228bd8e20439518a6f1586 Mon Sep 17 00:00:00 2001 From: Konstantin Herud Date: Sun, 30 Jun 2024 21:49:14 +0200 Subject: [PATCH 02/24] Start updating server code to b3265 --- src/main/cpp/jllama.cpp | 44 +++++---- src/main/cpp/server.hpp | 205 +++++++++++++++++++++++++++++++++------- src/main/cpp/utils.hpp | 59 ++++-------- 3 files changed, 212 insertions(+), 96 deletions(-) diff --git a/src/main/cpp/jllama.cpp b/src/main/cpp/jllama.cpp index 2298c190..251b4940 100644 --- a/src/main/cpp/jllama.cpp +++ b/src/main/cpp/jllama.cpp @@ -355,13 +355,12 @@ JNIEXPORT void JNICALL JNI_OnUnload(JavaVM *vm, void *reserved) JNIEXPORT void JNICALL Java_de_kherud_llama_LlamaModel_loadModel(JNIEnv *env, jobject obj, jstring jparams) { gpt_params params; - server_params sparams; auto *ctx_server = new server_context(); std::string c_params = parse_jstring(env, jparams); json json_params = json::parse(c_params); - server_params_parse(json_params, sparams, params); + server_params_parse(json_params, params); if (json_value(json_params, "disable_log", false)) { @@ -372,9 +371,9 @@ JNIEXPORT void JNICALL Java_de_kherud_llama_LlamaModel_loadModel(JNIEnv *env, jo log_enable(); } - if (!sparams.system_prompt.empty()) + if (!params.system_prompt.empty()) { - ctx_server->system_prompt_set(sparams.system_prompt); + ctx_server->system_prompt_set(params.system_prompt); } if (params.model_alias == "unknown") @@ -395,6 +394,9 @@ JNIEXPORT void JNICALL Java_de_kherud_llama_LlamaModel_loadModel(JNIEnv *env, jo std::atomic state{SERVER_STATE_LOADING_MODEL}; + // Necessary similarity of prompt for slot selection + ctx_server->slot_prompt_similarity = params.slot_prompt_similarity; + // load the model if (!ctx_server->load_model(params)) { @@ -411,32 +413,36 @@ JNIEXPORT void JNICALL Java_de_kherud_llama_LlamaModel_loadModel(JNIEnv *env, jo const auto model_meta = ctx_server->model_meta(); // if a custom chat template is not supplied, we will use the one that comes with the model (if any) - if (sparams.chat_template.empty()) + if (params.chat_template.empty()) { if (!ctx_server->validate_model_chat_template()) { LOG_ERROR("The chat template that comes with this model is not yet supported, falling back to chatml. This " "may cause the model to output suboptimal responses", {}); - sparams.chat_template = "chatml"; + params.chat_template = "chatml"; } } - ctx_server->chat_template = sparams.chat_template; - // print sample chat example to make it clear which template is used + // if a custom chat template is not supplied, we will use the one that comes with the model (if any) + if (params.chat_template.empty()) { - json chat; - chat.push_back({{"role", "system"}, {"content", "You are a helpful assistant"}}); - chat.push_back({{"role", "user"}, {"content", "Hello"}}); - chat.push_back({{"role", "assistant"}, {"content", "Hi there"}}); - chat.push_back({{"role", "user"}, {"content", "How are you?"}}); - - const std::string chat_example = format_chat(ctx_server->model, sparams.chat_template, chat); + if (!ctx_server->validate_model_chat_template()) + { + LOG_ERROR("The chat template that comes with this model is not yet supported, falling back to chatml. 
This " + "may cause the model to output suboptimal responses", + {}); + params.chat_template = "chatml"; + } + } - LOG_INFO("chat template", { - {"chat_example", chat_example}, - {"built_in", sparams.chat_template.empty()}, - }); + // print sample chat example to make it clear which template is used + { + LOG_INFO("chat template", + { + {"chat_example", llama_chat_format_example(ctx_server->model, params.chat_template)}, + {"built_in", params.chat_template.empty()}, + }); } ctx_server->queue_tasks.on_new_task( diff --git a/src/main/cpp/server.hpp b/src/main/cpp/server.hpp index 5b9064de..3b362371 100644 --- a/src/main/cpp/server.hpp +++ b/src/main/cpp/server.hpp @@ -103,12 +103,6 @@ struct slot_params json input_suffix; }; -struct server_params -{ - std::string chat_template = ""; - std::string system_prompt = ""; -}; - struct server_slot { int id; @@ -700,6 +694,9 @@ struct server_context server_metrics metrics; + // Necessary similarity of prompt for slot selection + float slot_prompt_similarity = 0.0f; + ~server_context() { if (ctx) @@ -866,28 +863,103 @@ struct server_context return prompt_tokens; } - server_slot *get_slot(int id) + server_slot *get_slot_by_id(int id) { - int64_t t_last = ggml_time_us(); - - server_slot *last_used = nullptr; - for (server_slot &slot : slots) { - if (slot.id == id && slot.available()) + if (slot.id == id) { return &slot; } + } + + return nullptr; + } + + server_slot *get_available_slot(const std::string &prompt) + { + server_slot *ret = nullptr; - // among all available slots, find the one that has been least recently used - if (slot.available() && slot.t_last_used < t_last) + // find the slot that has at least n% prompt similarity + if (ret == nullptr && slot_prompt_similarity != 0.0f && !prompt.empty()) + { + int max_lcp_len = 0; + float similarity = 0; + + for (server_slot &slot : slots) { - last_used = &slot; - t_last = slot.t_last_used; + // skip the slot if it is not available + if (!slot.available()) + { + continue; + } + + // skip the slot if it does not contains prompt + if (!slot.prompt.is_string()) + { + continue; + } + + // current slot's prompt + std::string slot_prompt = slot.prompt.get(); + + // length of the current slot's prompt + int slot_prompt_len = slot_prompt.size(); + + // length of the Longest Common Prefix between the current slot's prompt and the input prompt + int lcp_len = common_part(slot_prompt, prompt); + + // fraction of the common substring length compared to the current slot's prompt length + similarity = static_cast(lcp_len) / slot_prompt_len; + + // select the current slot if the criteria match + if (lcp_len > max_lcp_len && similarity > slot_prompt_similarity) + { + max_lcp_len = lcp_len; + ret = &slot; + } + } + + if (ret != nullptr) + { + LOG_VERBOSE("selected slot by lcp similarity", { + {"id_slot", ret->id}, + {"max_lcp_len", max_lcp_len}, + {"similarity", similarity}, + }); } } - return last_used; + // find the slot that has been least recently used + if (ret == nullptr) + { + int64_t t_last = ggml_time_us(); + for (server_slot &slot : slots) + { + // skip the slot if it is not available + if (!slot.available()) + { + continue; + } + + // select the current slot if the criteria match + if (slot.t_last_used < t_last) + { + t_last = slot.t_last_used; + ret = &slot; + } + } + + if (ret != nullptr) + { + LOG_VERBOSE("selected slot by lru", { + {"id_slot", ret->id}, + {"t_last", t_last}, + }); + } + } + + return ret; } bool launch_slot_with_task(server_slot &slot, const server_task &task) @@ -947,19 +1019,23 @@ 
struct server_context slot.params.input_suffix = json_value(data, "input_suffix", default_params.input_suffix); // get prompt + if (!task.infill) { const auto &prompt = data.find("prompt"); if (prompt == data.end()) { - send_error(task, R"(Either "prompt" or "messages" must be provided)", ERROR_TYPE_INVALID_REQUEST); + send_error(task, "\"prompt\" must be provided", ERROR_TYPE_INVALID_REQUEST); return false; } - slot.prompt = *prompt; - - if (slot.prompt.is_array() && slot.prompt.empty()) + if ((prompt->is_string()) || (prompt->is_array() && prompt->size() == 1 && prompt->at(0).is_string()) || + (prompt->is_array() && !prompt->empty() && prompt->at(0).is_number_integer())) + { + slot.prompt = *prompt; + } + else { - send_error(task, "\"prompt\" cannot be an empty array", ERROR_TYPE_INVALID_REQUEST); + send_error(task, "\"prompt\" must be a string or an array of integers", ERROR_TYPE_INVALID_REQUEST); return false; } } @@ -1663,7 +1739,25 @@ struct server_context switch (task.type) { case SERVER_TASK_TYPE_COMPLETION: { - server_slot *slot = get_slot(json_value(task.data, "id_slot", -1)); + const int id_slot = json_value(task.data, "id_slot", -1); + + server_slot *slot; + + if (id_slot != -1) + { + slot = get_slot_by_id(id_slot); + } + else + { + std::string prompt; + if (task.data.contains("prompt") && task.data.at("prompt").is_string()) + { + prompt = json_value(task.data, "prompt", std::string()); + } + + slot = get_available_slot(prompt); + } + if (slot == nullptr) { // if no slot is available, we defer this task for processing later @@ -1671,6 +1765,13 @@ struct server_context queue_tasks.defer(task); break; } + if (!slot->available()) + { + // if requested slot is unavailable, we defer this task for processing later + LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}}); + queue_tasks.defer(task); + break; + } if (task.data.contains("system_prompt")) { @@ -1790,12 +1891,19 @@ struct server_context break; case SERVER_TASK_TYPE_SLOT_SAVE: { int id_slot = task.data.at("id_slot"); - server_slot *slot = get_slot(id_slot); + server_slot *slot = get_slot_by_id(id_slot); if (slot == nullptr) { send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST); break; } + if (!slot->available()) + { + // if requested slot is unavailable, we defer this task for processing later + LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}}); + queue_tasks.defer(task); + break; + } const size_t token_count = slot->cache_tokens.size(); const int64_t t_start = ggml_time_us(); @@ -1823,12 +1931,19 @@ struct server_context break; case SERVER_TASK_TYPE_SLOT_RESTORE: { int id_slot = task.data.at("id_slot"); - server_slot *slot = get_slot(id_slot); + server_slot *slot = get_slot_by_id(id_slot); if (slot == nullptr) { send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST); break; } + if (!slot->available()) + { + // if requested slot is unavailable, we defer this task for processing later + LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}}); + queue_tasks.defer(task); + break; + } const int64_t t_start = ggml_time_us(); @@ -1865,12 +1980,19 @@ struct server_context break; case SERVER_TASK_TYPE_SLOT_ERASE: { int id_slot = task.data.at("id_slot"); - server_slot *slot = get_slot(id_slot); + server_slot *slot = get_slot_by_id(id_slot); if (slot == nullptr) { send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST); break; } + if (!slot->available()) + { + // if requested slot is unavailable, we defer this task for processing later + LOG_VERBOSE("requested 
slot is unavailable", {{"id_task", task.id}}); + queue_tasks.defer(task); + break; + } // Erase token cache const size_t n_erased = slot->cache_tokens.size(); @@ -2074,6 +2196,7 @@ struct server_context if (slot.infill) { + const bool add_bos = llama_should_add_bos_token(model); bool suff_rm_leading_spc = true; if (params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) { @@ -2091,11 +2214,23 @@ struct server_context } prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(model)); - prefix_tokens.insert(prefix_tokens.begin(), llama_token_bos(model)); // always add BOS - prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(model)); - prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), suffix_tokens.end()); - prefix_tokens.push_back(llama_token_middle(model)); - prompt_tokens = prefix_tokens; + suffix_tokens.insert(suffix_tokens.begin(), llama_token_suffix(model)); + + auto embd_inp = params.spm_infill ? suffix_tokens : prefix_tokens; + auto embd_end = params.spm_infill ? prefix_tokens : suffix_tokens; + if (add_bos) + { + embd_inp.insert(embd_inp.begin(), llama_token_bos(model)); + } + embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end()); + + const llama_token middle_token = llama_token_middle(model); + if (middle_token >= 0) + { + embd_inp.push_back(middle_token); + } + + prompt_tokens = embd_inp; } else { @@ -2138,7 +2273,6 @@ struct server_context slot.state = SLOT_STATE_PROCESSING; slot.command = SLOT_COMMAND_NONE; slot.release(); - slot.print_timings(); send_error(slot, "input is too large to process. increase the physical batch size", ERROR_TYPE_SERVER); continue; @@ -2531,10 +2665,9 @@ struct server_context }; // parse the given jparams (see de.kherud.llama.args.ModelParameters#toString()) from JSON to the required C++ struct. 
-static void server_params_parse(json jparams, server_params &sparams, gpt_params ¶ms) +static void server_params_parse(json jparams, gpt_params ¶ms) { gpt_params default_params; - server_params default_sparams; params.seed = json_value(jparams, "seed", default_params.seed); params.n_threads = json_value(jparams, "n_threads", default_params.n_threads); @@ -2591,8 +2724,8 @@ static void server_params_parse(json jparams, server_params &sparams, gpt_params params.use_mmap = json_value(jparams, "use_mmap", default_params.use_mmap); params.use_mlock = json_value(jparams, "use_mlock", default_params.use_mlock); params.no_kv_offload = json_value(jparams, "no_kv_offload", default_params.no_kv_offload); - sparams.system_prompt = json_value(jparams, "system_prompt", default_sparams.system_prompt); - sparams.chat_template = json_value(jparams, "chat_template", default_sparams.chat_template); + params.system_prompt = json_value(jparams, "system_prompt", default_params.system_prompt); + params.chat_template = json_value(jparams, "chat_template", default_params.chat_template); if (jparams.contains("n_gpu_layers")) { diff --git a/src/main/cpp/utils.hpp b/src/main/cpp/utils.hpp index ad7198c1..361be519 100644 --- a/src/main/cpp/utils.hpp +++ b/src/main/cpp/utils.hpp @@ -97,10 +97,7 @@ static inline void server_log(ggml_log_level level, const char *function, int li json log = json{ {"msg", message}, #if SERVER_VERBOSE - {"ts", time(nullptr)}, - {"level", log_level_to_string(level)}, - {"tid", ss_tid.str()}, - {"function", function}, + {"ts", time(nullptr)}, {"level", log_level_to_string(level)}, {"tid", ss_tid.str()}, {"function", function}, {"line", line}, #endif }; @@ -135,9 +132,7 @@ static inline void server_log(ggml_log_level level, const char *function, int li } #if SERVER_VERBOSE - ss << " | ts " << time(nullptr) - << " | tid " << ss_tid.str() - << " | " << function << " line " << line; + ss << " | ts " << time(nullptr) << " | tid " << ss_tid.str() << " | " << function << " line " << line; #endif const std::string str = ss.str(); @@ -157,50 +152,22 @@ static inline void server_log(ggml_log_level level, const char *function, int li // chat template utils // -// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid -inline bool verify_custom_template(const std::string &tmpl) -{ - llama_chat_message chat[] = {{"user", "test"}}; - int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0); - return res >= 0; -} - // Format given chat. If tmpl is empty, we take the template from model metadata inline std::string format_chat(const struct llama_model *model, const std::string &tmpl, const std::vector &messages) { - size_t alloc_size = 0; - // vector holding all allocated string to be passed to llama_chat_apply_template - std::vector str(messages.size() * 2); - std::vector chat(messages.size()); + std::vector chat; for (size_t i = 0; i < messages.size(); ++i) { const auto &curr_msg = messages[i]; - str[i * 2 + 0] = json_value(curr_msg, "role", std::string("")); - str[i * 2 + 1] = json_value(curr_msg, "content", std::string("")); - alloc_size += str[i * 2 + 1].length(); - chat[i].role = str[i * 2 + 0].c_str(); - chat[i].content = str[i * 2 + 1].c_str(); - } - - const char *ptr_tmpl = tmpl.empty() ? 
nullptr : tmpl.c_str(); - std::vector buf(alloc_size * 2); - - // run the first time to get the total output length - int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size()); - - // if it turns out that our buffer is too small, we resize it - if ((size_t)res > buf.size()) - { - buf.resize(res); - res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size()); + std::string role = json_value(curr_msg, "role", std::string("")); + std::string content = json_value(curr_msg, "content", std::string("")); + chat.push_back({role, content}); } - const std::string formatted_chat(buf.data(), res); - + auto formatted_chat = llama_chat_apply_template(model, tmpl, chat, true); LOG_VERBOSE("formatted_chat", {{"text", formatted_chat.c_str()}}); - return formatted_chat; } @@ -322,6 +289,16 @@ static size_t common_part(const std::vector &a, const std::vector= suffix.size() && 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix); @@ -674,7 +651,7 @@ static json format_embeddings_response_oaicompat(const json &request, const json { json data = json::array(); int i = 0; - for (const auto &elem : embeddings) + for (auto &elem : embeddings) { data.push_back( json{{"embedding", json_value(elem, "embedding", json::array())}, {"index", i++}, {"object", "embedding"}}); From 68d78ed58f699a88912419710806f92029eee2bb Mon Sep 17 00:00:00 2001 From: Konstantin Herud Date: Mon, 15 Jul 2024 20:56:09 +0200 Subject: [PATCH 03/24] fix embedding mode segmentation fault --- src/main/cpp/jllama.cpp | 2 +- src/main/cpp/server.hpp | 30 ++++++++++++++++++++++-------- 2 files changed, 23 insertions(+), 9 deletions(-) diff --git a/src/main/cpp/jllama.cpp b/src/main/cpp/jllama.cpp index 251b4940..d59f3b77 100644 --- a/src/main/cpp/jllama.cpp +++ b/src/main/cpp/jllama.cpp @@ -486,7 +486,7 @@ JNIEXPORT jint JNICALL Java_de_kherud_llama_LlamaModel_requestCompletion(JNIEnv json chat; chat.push_back({{"role", "system"}, {"content", ctx_server->system_prompt}}); chat.push_back({{"role", "user"}, {"content", json_params["prompt"]}}); - json_params["prompt"] = format_chat(ctx_server->model, ctx_server->chat_template, chat); + json_params["prompt"] = format_chat(ctx_server->model, ctx_server->params.chat_template, chat); } const int id_task = ctx_server->queue_tasks.get_new_id(); diff --git a/src/main/cpp/server.hpp b/src/main/cpp/server.hpp index 3b362371..e635cfc5 100644 --- a/src/main/cpp/server.hpp +++ b/src/main/cpp/server.hpp @@ -680,11 +680,6 @@ struct server_context std::string system_prompt; std::vector system_tokens; - std::string name_user; // this should be the antiprompt - std::string name_assistant; - - std::string chat_template; - // slots / clients std::vector slots; json default_generation_settings_for_props; @@ -966,7 +961,7 @@ struct server_context { slot_params default_params; llama_sampling_params default_sparams; - auto &data = task.data; + const auto &data = task.data; slot.oaicompat = false; slot.oaicompat_model = ""; @@ -1622,12 +1617,12 @@ struct server_context } const float *embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]); - if (embd == nullptr) + if (embd == NULL) { embd = llama_get_embeddings_ith(ctx, i); } - if (embd == nullptr) + if (embd == NULL) { LOG_ERROR("failed to get embeddings", {{"token", batch.token[i]}, {"seq_id", batch.seq_id[i][0]}}); @@ -2176,6 +2171,11 @@ struct server_context int32_t n_batch = llama_n_batch(ctx); int32_t n_ubatch = llama_n_ubatch(ctx); + // track if this is an 
embedding or non-embedding batch + // if we've added sampled tokens above, we are in non-embedding mode + // -1: none, 0: non-embedding, 1: embedding + int32_t batch_type = batch.n_tokens > 0 ? 0 : -1; + // next, batch any pending prompts without exceeding n_batch if (params.cont_batching || batch.n_tokens == 0) { @@ -2370,6 +2370,17 @@ struct server_context } } + // check that we are in the right batch_type, if not defer the slot + bool slot_type = slot.embedding ? 1 : 0; + if (batch_type == -1) + { + batch_type = slot_type; + } + else if (batch_type != slot_type) + { + continue; + } + // keep only the common part int p0 = (int)system_tokens.size() + slot.n_past; if (!llama_kv_cache_seq_rm(ctx, slot.id + 1, p0, -1)) @@ -2478,6 +2489,9 @@ struct server_context {"n_tokens", batch.n_tokens}, }); + // make sure we're in the right embedding mode + llama_set_embeddings(ctx, batch_type == 1); + // process the created batch of tokens for (int32_t i = 0; i < batch.n_tokens; i += n_batch) { From d7121c4ed7fdc77d4aa6819d6d3526a92c7b97c2 Mon Sep 17 00:00:00 2001 From: Konstantin Herud Date: Mon, 15 Jul 2024 20:56:23 +0200 Subject: [PATCH 04/24] upgrade to llama.cpp b3398 --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a7c2c4e1..1ec133ed 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -24,7 +24,7 @@ FetchContent_MakeAvailable(json) FetchContent_Declare( llama.cpp GIT_REPOSITORY https://github.com/ggerganov/llama.cpp.git - GIT_TAG b3265 + GIT_TAG b3398 ) FetchContent_MakeAvailable(llama.cpp) From 95fd5eaa8a4ae384af5f6602a9f202f9687fba16 Mon Sep 17 00:00:00 2001 From: Konstantin Herud Date: Mon, 15 Jul 2024 21:40:30 +0200 Subject: [PATCH 05/24] reduce unit tests context size --- src/test/java/de/kherud/llama/LlamaModelTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test/java/de/kherud/llama/LlamaModelTest.java b/src/test/java/de/kherud/llama/LlamaModelTest.java index a5454c59..c7ece673 100644 --- a/src/test/java/de/kherud/llama/LlamaModelTest.java +++ b/src/test/java/de/kherud/llama/LlamaModelTest.java @@ -1,7 +1,6 @@ package de.kherud.llama; import java.io.*; -import java.nio.charset.StandardCharsets; import java.util.*; import java.util.regex.Pattern; @@ -24,6 +23,7 @@ public static void setup() { // LlamaModel.setLogger(LogFormat.TEXT, (level, msg) -> System.out.println(level + ": " + msg)); model = new LlamaModel( new ModelParameters() + .setNCtx(128) .setModelFilePath("models/codellama-7b.Q2_K.gguf") // .setModelUrl("https://huggingface.co/TheBloke/CodeLlama-7B-GGUF/resolve/main/codellama-7b.Q2_K.gguf") .setNGpuLayers(43) From 3959e4b0bde844237928cf3ac06f3a8942b8c8bc Mon Sep 17 00:00:00 2001 From: Konstantin Herud Date: Mon, 5 Aug 2024 21:37:08 +0200 Subject: [PATCH 06/24] update to llama.cpp b3525 --- CMakeLists.txt | 2 +- src/main/cpp/server.hpp | 18 ++++--- src/main/cpp/utils.hpp | 49 ++++++++++++------- .../java/de/kherud/llama/ModelParameters.java | 9 ---- 4 files changed, 43 insertions(+), 35 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1ec133ed..e7ce9fc9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -24,7 +24,7 @@ FetchContent_MakeAvailable(json) FetchContent_Declare( llama.cpp GIT_REPOSITORY https://github.com/ggerganov/llama.cpp.git - GIT_TAG b3398 + GIT_TAG b3525 ) FetchContent_MakeAvailable(llama.cpp) diff --git a/src/main/cpp/server.hpp b/src/main/cpp/server.hpp index e635cfc5..0601dac4 100644 --- a/src/main/cpp/server.hpp +++ b/src/main/cpp/server.hpp 
@@ -725,7 +725,10 @@ struct server_context // dedicate one sequence to the system prompt params.n_parallel += 1; - std::tie(model, ctx) = llama_init_from_gpt_params(params); + llama_init_result llama_init = llama_init_from_gpt_params(params); + + model = llama_init.model; + ctx = llama_init.context; params.n_parallel -= 1; // but be sneaky about it if (model == nullptr) { @@ -783,6 +786,8 @@ struct server_context slot.ga_n = ga_n; slot.ga_w = ga_w; + slot.sparams = params.sparams; + slot.reset(); slots.push_back(slot); @@ -960,15 +965,17 @@ struct server_context bool launch_slot_with_task(server_slot &slot, const server_task &task) { slot_params default_params; - llama_sampling_params default_sparams; - const auto &data = task.data; + // Sampling parameter defaults are loaded from the global server context (but individual requests can still + // override them) + llama_sampling_params default_sparams = params.sparams; + auto &data = task.data; slot.oaicompat = false; slot.oaicompat_model = ""; slot.params.stream = json_value(data, "stream", false); slot.params.cache_prompt = json_value(data, "cache_prompt", false); - slot.params.n_predict = json_value(data, "n_predict", default_params.n_predict); + slot.params.n_predict = json_value(data, "n_predict", json_value(data, "max_tokens", default_params.n_predict)); slot.sparams.top_k = json_value(data, "top_k", default_sparams.top_k); slot.sparams.top_p = json_value(data, "top_p", default_sparams.top_p); slot.sparams.min_p = json_value(data, "min_p", default_sparams.min_p); @@ -1286,7 +1293,7 @@ struct server_context bool process_token(completion_token_output &result, server_slot &slot) { // remember which tokens were sampled - used for repetition penalties during sampling - const std::string token_str = llama_token_to_piece(ctx, result.tok, false); + const std::string token_str = llama_token_to_piece(ctx, result.tok, params.special); slot.sampled = result.tok; // search stop word and delete it @@ -2728,7 +2735,6 @@ static void server_params_parse(json jparams, gpt_params ¶ms) params.lookup_cache_dynamic = json_value(jparams, "lookup_cache_dynamic", default_params.lookup_cache_dynamic); params.logits_file = json_value(jparams, "logits_file", default_params.logits_file); params.lora_adapter = json_value(jparams, "lora_adapter", default_params.lora_adapter); - params.lora_base = json_value(jparams, "lora_base", default_params.lora_base); params.embedding = json_value(jparams, "embedding", default_params.embedding); params.escape = json_value(jparams, "escape", default_params.escape); params.cont_batching = json_value(jparams, "cont_batching", default_params.cont_batching); diff --git a/src/main/cpp/utils.hpp b/src/main/cpp/utils.hpp index 361be519..7de7eac4 100644 --- a/src/main/cpp/utils.hpp +++ b/src/main/cpp/utils.hpp @@ -161,8 +161,37 @@ inline std::string format_chat(const struct llama_model *model, const std::strin for (size_t i = 0; i < messages.size(); ++i) { const auto &curr_msg = messages[i]; + std::string role = json_value(curr_msg, "role", std::string("")); - std::string content = json_value(curr_msg, "content", std::string("")); + + std::string content; + if (curr_msg.contains("content")) + { + if (curr_msg["content"].is_string()) + { + content = curr_msg["content"].get(); + } + else if (curr_msg["content"].is_array()) + { + for (const auto &part : curr_msg["content"]) + { + if (part.contains("text")) + { + content += "\n" + part["text"].get(); + } + } + } + else + { + throw std::runtime_error( + "Invalid 'content' type (ref: 
https://github.com/ggerganov/llama.cpp/issues/8367)"); + } + } + else + { + throw std::runtime_error("Missing 'content' (ref: https://github.com/ggerganov/llama.cpp/issues/8367)"); + } + chat.push_back({role, content}); } @@ -409,24 +438,6 @@ static json oaicompat_completion_params_parse(const struct llama_model *model, llama_params["__oaicompat"] = true; - // Map OpenAI parameters to llama.cpp parameters - // - // For parameters that are defined by the OpenAI documentation (e.g. - // temperature), we explicitly specify OpenAI's intended default; we - // need to do that because sometimes OpenAI disagrees with llama.cpp - // - // https://platform.openai.com/docs/api-reference/chat/create - llama_sampling_params default_sparams; - llama_params["model"] = json_value(body, "model", std::string("unknown")); - llama_params["frequency_penalty"] = json_value(body, "frequency_penalty", 0.0); - llama_params["logit_bias"] = json_value(body, "logit_bias", json::object()); - llama_params["n_predict"] = json_value(body, "max_tokens", -1); - llama_params["presence_penalty"] = json_value(body, "presence_penalty", 0.0); - llama_params["seed"] = json_value(body, "seed", LLAMA_DEFAULT_SEED); - llama_params["stream"] = json_value(body, "stream", false); - llama_params["temperature"] = json_value(body, "temperature", 1.0); - llama_params["top_p"] = json_value(body, "top_p", 1.0); - // Apply chat template to the list of messages llama_params["prompt"] = format_chat(model, chat_template, body.at("messages")); diff --git a/src/main/java/de/kherud/llama/ModelParameters.java b/src/main/java/de/kherud/llama/ModelParameters.java index 98342d37..3b34d3f3 100644 --- a/src/main/java/de/kherud/llama/ModelParameters.java +++ b/src/main/java/de/kherud/llama/ModelParameters.java @@ -54,7 +54,6 @@ public final class ModelParameters extends JsonParameters { private static final String PARAM_LOOKUP_CACHE_STATIC = "lookup_cache_static"; private static final String PARAM_LOOKUP_CACHE_DYNAMIC = "lookup_cache_dynamic"; private static final String PARAM_LORA_ADAPTER = "lora_adapter"; - private static final String PARAM_LORA_BASE = "lora_base"; private static final String PARAM_EMBEDDING = "embedding"; private static final String PARAM_CONT_BATCHING = "cont_batching"; private static final String PARAM_FLASH_ATTENTION = "flash_attn"; @@ -475,14 +474,6 @@ public ModelParameters setLoraAdapters(Map loraAdapters) { return this; } - /** - * Set an optional model to use as a base for the layers modified by the LoRA adapter - */ - public ModelParameters setLoraBase(String loraBase) { - parameters.put(PARAM_LORA_BASE, toJsonString(loraBase)); - return this; - } - /** * Whether to load model with embedding support */ From f3ded5c04cdca41b1475dc01c34c3c370e810d0d Mon Sep 17 00:00:00 2001 From: Konstantin Herud Date: Tue, 6 Aug 2024 18:29:51 +0200 Subject: [PATCH 07/24] add ggml shared library to binding --- CMakeLists.txt | 5 +-- .../java/de/kherud/llama/LlamaLoader.java | 38 +++++++++---------- 2 files changed, 19 insertions(+), 24 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e7ce9fc9..32e9e2ff 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,7 +5,6 @@ project(jllama CXX) include(FetchContent) set(BUILD_SHARED_LIBS ON) -set(LLAMA_STATIC OFF) set(CMAKE_POSITION_INDEPENDENT_CODE ON) option(LLAMA_VERBOSE "llama: verbose output" OFF) @@ -98,11 +97,11 @@ target_compile_definitions(jllama PRIVATE ) if(OS_NAME STREQUAL "Windows") - set_target_properties(jllama llama PROPERTIES + set_target_properties(jllama llama ggml 
PROPERTIES RUNTIME_OUTPUT_DIRECTORY_RELEASE ${JLLAMA_DIR} ) else() - set_target_properties(jllama llama PROPERTIES + set_target_properties(jllama llama ggml PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${JLLAMA_DIR} ) endif() diff --git a/src/main/java/de/kherud/llama/LlamaLoader.java b/src/main/java/de/kherud/llama/LlamaLoader.java index 5aa84001..a0239d20 100644 --- a/src/main/java/de/kherud/llama/LlamaLoader.java +++ b/src/main/java/de/kherud/llama/LlamaLoader.java @@ -62,6 +62,7 @@ static synchronized void initialize() throws UnsatisfiedLinkError { System.err.println("'ggml-metal.metal' not found"); } } + loadNativeLibrary("ggml"); loadNativeLibrary("llama"); loadNativeLibrary("jllama"); extracted = true; @@ -96,12 +97,7 @@ private static void cleanPath(Path path) { private static void loadNativeLibrary(String name) { List triedPaths = new LinkedList<>(); - // Try loading library from de.kherud.llama.lib.path library path - String nativeLibName = System.getProperty("de.kherud.llama.lib.name"); - if (nativeLibName == null) { - nativeLibName = System.mapLibraryName(name); - } - + String nativeLibName = System.mapLibraryName(name); String nativeLibPath = System.getProperty("de.kherud.llama.lib.path"); if (nativeLibPath != null) { Path path = Paths.get(nativeLibPath, nativeLibName); @@ -125,21 +121,7 @@ private static void loadNativeLibrary(String name) { } } - // Load the os-dependent library from the jar file - nativeLibPath = getNativeResourcePath(); - if (hasNativeLib(nativeLibPath, nativeLibName)) { - // temporary library folder - String tempFolder = getTempDir().getAbsolutePath(); - // Try extracting the library from jar - if (extractAndLoadLibraryFile(nativeLibPath, nativeLibName, tempFolder)) { - return; - } - else { - triedPaths.add(nativeLibPath); - } - } - - // As a last resort try from java.library.path + // Try to load the library from java.library.path String javaLibraryPath = System.getProperty("java.library.path", ""); for (String ldPath : javaLibraryPath.split(File.pathSeparator)) { if (ldPath.isEmpty()) { @@ -154,6 +136,20 @@ private static void loadNativeLibrary(String name) { } } + // As a last resort try load the os-dependent library from the jar file + nativeLibPath = getNativeResourcePath(); + if (hasNativeLib(nativeLibPath, nativeLibName)) { + // temporary library folder + String tempFolder = getTempDir().getAbsolutePath(); + // Try extracting the library from jar + if (extractAndLoadLibraryFile(nativeLibPath, nativeLibName, tempFolder)) { + return; + } + else { + triedPaths.add(nativeLibPath); + } + } + throw new UnsatisfiedLinkError( String.format( "No native library found for os.name=%s, os.arch=%s, paths=[%s]", From f3adb35a94704298bbfad574599d0f7769944890 Mon Sep 17 00:00:00 2001 From: Konstantin Herud Date: Tue, 6 Aug 2024 18:30:15 +0200 Subject: [PATCH 08/24] update library compilation readme --- README.md | 86 +++++++++++++++++-------------------------------------- 1 file changed, 26 insertions(+), 60 deletions(-) diff --git a/README.md b/README.md index 2f2d2dfd..febf4f14 100644 --- a/README.md +++ b/README.md @@ -3,8 +3,7 @@ # Java Bindings for [llama.cpp](https://github.com/ggerganov/llama.cpp) -The main goal of llama.cpp is to run the LLaMA model using 4-bit integer quantization on a MacBook. -This repository provides Java bindings for the C++ library. +Inference of Meta's LLaMA model (and others) in pure C/C++. 
**You are welcome to contribute** @@ -39,7 +38,7 @@ There are multiple [examples](src/test/java/examples): We support CPU inference for the following platforms out of the box: - Linux x86-64, aarch64 -- MacOS x86-64, aarch64 (M1) +- MacOS x86-64, aarch64 (M-series) - Windows x86-64, x64, arm (32 bit) If any of these match your platform, you can include the Maven dependency and get started. @@ -47,82 +46,49 @@ If any of these match your platform, you can include the Maven dependency and ge ### Setup required If none of the above listed platforms matches yours, currently you have to compile the library yourself (also if you -want GPU acceleration, see below). +want GPU acceleration). -This requires: +This consists of two steps: 1) Compiling the libraries and 2) putting them in the right location. -- Git -- A C++11 conforming compiler -- The [cmake](https://www.cmake.org/) build system -- Java, Maven, and setting [JAVA_HOME](https://www.baeldung.com/java-home-on-windows-7-8-10-mac-os-x-linux) +##### Library Compilation -Make sure everything works by running - -``` -g++ -v # depending on your compiler -java -version -mvn -v -echo $JAVA_HOME # for linux/macos -echo %JAVA_HOME% # for windows -``` - -Then, checkout [llama.cpp](https://github.com/ggerganov/llama.cpp) to know which build arguments to use (e.g. for CUDA support). -Finally, you have to run following commands in the directory of this repository (java-llama.cpp). -Remember to add your build arguments in the fourth line (`cmake ..`): +First, have a look at [llama.cpp](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md) to know which build arguments to use (e.g. for CUDA support). +Any build option of llama.cpp works equivalently for this project. +You then have to run the following commands in the directory of this repository (java-llama.cpp): ```shell -mvn compile -mkdir build -cd build -cmake .. # add any other arguments for your backend -cmake --build . --config Release +mvn compile # don't forget this line +cmake -B build # add any other arguments for your backend, e.g. -DGGML_CUDA=ON +cmake --build build --config Release ``` > [!TIP] -> Use `-DLLAMA_CURL=ON` to download models via Java code using `ModelParameters#setModelUrl(String)`. +> Use `-DGGML_CURL=ON` to download models via Java code using `ModelParameters#setModelUrl(String)`. -All required files will be put in a resources directory matching your platform, which will appear in the cmake output. For example something like: +All compiled libraries will be put in a resources directory matching your platform, which will appear in the cmake output. For example something like: ```shell -- Installing files to /java-llama.cpp/src/main/resources/de/kherud/llama/Linux/x86_64 ``` -This includes: - -- Linux: `libllama.so`, `libjllama.so` -- MacOS: `libllama.dylib`, `libjllama.dylib`, `ggml-metal.metal` -- Windows: `llama.dll`, `jllama.dll` - -If you then compile your own JAR from this directory, you are ready to go. Otherwise, if you still want to use the library -as a Maven dependency, see below how to set the necessary paths in order for Java to find your compiled libraries. +#### Library Location -### Custom llama.cpp Setup (GPU acceleration) +This project has to load three shared libraries: -This repository provides default support for CPU based inference. You can compile `llama.cpp` any way you want, however (see [Setup Required](#setup-required)). 
-In order to use your self-compiled library, set either of the [JVM options](https://www.jetbrains.com/help/idea/tuning-the-ide.html#configure-jvm-options): +- ggml +- llama +- jllama -- `de.kherud.llama.lib.path`, for example `-Dde.kherud.llama.lib.path=/directory/containing/lib` -- `java.library.path`, for example `-Djava.library.path=/directory/containing/lib` - -This repository uses [`System#mapLibraryName`](https://docs.oracle.com/javase%2F7%2Fdocs%2Fapi%2F%2F/java/lang/System.html) to determine the name of the shared library for you platform. -If for any reason your library has a different name, you can set it with - -- `de.kherud.llama.lib.name`, for example `-Dde.kherud.llama.lib.name=myname.so` - -For compiling `llama.cpp`, refer to the official [readme](https://github.com/ggerganov/llama.cpp#build) for details. -The library can be built with the `llama.cpp` project: - -```shell -mkdir build -cd build -cmake .. -DBUILD_SHARED_LIBS=ON # add any other arguments for your backend -cmake --build . --config Release -``` +Note, that the file names vary between operating systems, e.g., `ggml.dll` on Windows, `libggml.so` on Linux, and `libggml.dylib` on macOS. -Look for the shared library in `build`. +The application will search in the following order in the following locations: -> [!IMPORTANT] -> If you are running MacOS with Metal, you have to put the file `ggml-metal.metal` from `build/bin` in the same directory as the shared library. +- In `de.kherud.llama.lib.path`: Use this option if you want a custom location for your shared libraries, i.e., set VM option `-Dde.kherud.llama.lib.path=/path/to/directory`. +- In `java.library.path`: These are predefined locations for each OS, e.g., `/usr/java/packages/lib:/usr/lib64:/lib64:/lib:/usr/lib` on Linux. + You can find out the locations using `System.out.println(System.getProperty("java.library.path"))`. + Use this option if you want to install the shared libraries as system libraries. +- From the JAR: If any of the libraries weren't found yet, the application will try to use a prebuilt shared library. + This of course only works for the [supported platforms](#no-setup-required) . ## Documentation From 312abb1c5164091b7d7a4b9083ba76f691893cc4 Mon Sep 17 00:00:00 2001 From: Konstantin Herud Date: Tue, 6 Aug 2024 18:48:08 +0200 Subject: [PATCH 09/24] minor readme update --- README.md | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index febf4f14..60a1dcec 100644 --- a/README.md +++ b/README.md @@ -31,7 +31,7 @@ Access this library via Maven: ``` -There are multiple [examples](src/test/java/examples): +There are multiple [examples](src/test/java/examples). ### No Setup required @@ -83,13 +83,17 @@ Note, that the file names vary between operating systems, e.g., `ggml.dll` on Wi The application will search in the following order in the following locations: -- In `de.kherud.llama.lib.path`: Use this option if you want a custom location for your shared libraries, i.e., set VM option `-Dde.kherud.llama.lib.path=/path/to/directory`. -- In `java.library.path`: These are predefined locations for each OS, e.g., `/usr/java/packages/lib:/usr/lib64:/lib64:/lib:/usr/lib` on Linux. +- In **de.kherud.llama.lib.path**: Use this option if you want a custom location for your shared libraries, i.e., set VM option `-Dde.kherud.llama.lib.path=/path/to/directory`. +- In **java.library.path**: These are predefined locations for each OS, e.g., `/usr/java/packages/lib:/usr/lib64:/lib64:/lib:/usr/lib` on Linux. 
You can find out the locations using `System.out.println(System.getProperty("java.library.path"))`. Use this option if you want to install the shared libraries as system libraries. -- From the JAR: If any of the libraries weren't found yet, the application will try to use a prebuilt shared library. +- From the **JAR**: If any of the libraries weren't found yet, the application will try to use a prebuilt shared library. This of course only works for the [supported platforms](#no-setup-required) . +Not all libraries have to be in the same location. +For example, if you already have a llama.cpp and ggml version you can install them as a system library and rely on the jllama library from the JAR. +This way, you don't have to compile anything. + ## Documentation ### Example From 255679ddad142577a1dd55815bbaab5d49bcccdb Mon Sep 17 00:00:00 2001 From: Konstantin Herud Date: Tue, 6 Aug 2024 18:48:19 +0200 Subject: [PATCH 10/24] update pom.xml version to 3.3.0 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 95bf822b..5b00bb42 100644 --- a/pom.xml +++ b/pom.xml @@ -4,7 +4,7 @@ de.kherud llama - 3.2.2 + 3.3.0 jar ${project.groupId}:${project.artifactId} From 775089a1226051cc861a489218d0e45685ee2145 Mon Sep 17 00:00:00 2001 From: Konstantin Herud Date: Tue, 6 Aug 2024 17:19:20 +0200 Subject: [PATCH 11/24] add debug statement --- src/test/java/de/kherud/llama/LlamaModelTest.java | 1 + 1 file changed, 1 insertion(+) diff --git a/src/test/java/de/kherud/llama/LlamaModelTest.java b/src/test/java/de/kherud/llama/LlamaModelTest.java index c7ece673..8fa8baa4 100644 --- a/src/test/java/de/kherud/llama/LlamaModelTest.java +++ b/src/test/java/de/kherud/llama/LlamaModelTest.java @@ -213,6 +213,7 @@ public void testLogStdout() { System.out.println("########## Log Text ##########"); LlamaModel.setLogger(LogFormat.TEXT, null); + System.out.println("DEBUG: Logger set"); model.complete(params); System.out.println("########## Log JSON ##########"); From ca988dbfe3bdf5848c5148317cc4765cf1cf49f2 Mon Sep 17 00:00:00 2001 From: Konstantin Herud Date: Tue, 6 Aug 2024 17:36:37 +0200 Subject: [PATCH 12/24] update debug statements --- src/main/cpp/jllama.cpp | 12 ++++++++++++ src/main/cpp/server.hpp | 4 ++++ src/main/cpp/utils.hpp | 12 ++++++++++++ src/test/java/de/kherud/llama/LlamaModelTest.java | 1 - 4 files changed, 28 insertions(+), 1 deletion(-) diff --git a/src/main/cpp/jllama.cpp b/src/main/cpp/jllama.cpp index d59f3b77..2e94b446 100644 --- a/src/main/cpp/jllama.cpp +++ b/src/main/cpp/jllama.cpp @@ -477,10 +477,14 @@ JNIEXPORT jint JNICALL Java_de_kherud_llama_LlamaModel_requestCompletion(JNIEnv jlong server_handle = env->GetLongField(obj, f_model_pointer); auto *ctx_server = reinterpret_cast(server_handle); // NOLINT(*-no-int-to-ptr) + std::cout << "DEBUG " << 1 << std::endl; + std::string c_params = parse_jstring(env, jparams); json json_params = json::parse(c_params); const bool infill = json_params.contains("input_prefix") || json_params.contains("input_suffix"); + std::cout << "DEBUG " << 2 << std::endl; + if (json_params.value("use_chat_template", false)) { json chat; @@ -489,8 +493,12 @@ JNIEXPORT jint JNICALL Java_de_kherud_llama_LlamaModel_requestCompletion(JNIEnv json_params["prompt"] = format_chat(ctx_server->model, ctx_server->params.chat_template, chat); } + std::cout << "DEBUG " << 3 << std::endl; + const int id_task = ctx_server->queue_tasks.get_new_id(); + std::cout << "DEBUG " << 4 << std::endl; 
ctx_server->queue_results.add_waiting_task_id(id_task); + std::cout << "DEBUG " << 5 << std::endl; ctx_server->request_completion(id_task, -1, json_params, infill, false); return id_task; @@ -501,6 +509,7 @@ JNIEXPORT jobject JNICALL Java_de_kherud_llama_LlamaModel_receiveCompletion(JNIE jlong server_handle = env->GetLongField(obj, f_model_pointer); auto *ctx_server = reinterpret_cast(server_handle); // NOLINT(*-no-int-to-ptr) + std::cout << "DEBUG " << 8 << std::endl; server_task_result result = ctx_server->queue_results.recv(id_task); if (result.error) @@ -510,12 +519,14 @@ JNIEXPORT jobject JNICALL Java_de_kherud_llama_LlamaModel_receiveCompletion(JNIE env->ThrowNew(c_llama_error, response.c_str()); return nullptr; } + std::cout << "DEBUG " << 9 << std::endl; std::string response = result.data["content"].get(); if (result.stop) { ctx_server->queue_results.remove_waiting_task_id(id_task); } + std::cout << "DEBUG " << 10 << std::endl; jobject o_probabilities = env->NewObject(c_hash_map, cc_hash_map); if (result.data.contains("completion_probabilities")) @@ -536,6 +547,7 @@ JNIEXPORT jobject JNICALL Java_de_kherud_llama_LlamaModel_receiveCompletion(JNIE } } } + std::cout << "DEBUG " << 11 << std::endl; jbyteArray jbytes = parse_jbytes(env, response); return env->NewObject(c_output, cc_output, jbytes, o_probabilities, result.stop); diff --git a/src/main/cpp/server.hpp b/src/main/cpp/server.hpp index 0601dac4..fcbe167b 100644 --- a/src/main/cpp/server.hpp +++ b/src/main/cpp/server.hpp @@ -1661,6 +1661,8 @@ struct server_context task.embedding = embedding; task.type = SERVER_TASK_TYPE_COMPLETION; + std::cout << "DEBUG " << 6 << std::endl; + // when a completion task's prompt array is not a singleton, we split it into multiple requests // otherwise, it's a single-prompt task, we actually queue it // if there's numbers in the prompt array it will be treated as an array of tokens @@ -1694,6 +1696,8 @@ struct server_context { queue_tasks.post(task); } + + std::cout << "DEBUG " << 7 << std::endl; } void request_cancel(int id_task) diff --git a/src/main/cpp/utils.hpp b/src/main/cpp/utils.hpp index 7de7eac4..9926ea97 100644 --- a/src/main/cpp/utils.hpp +++ b/src/main/cpp/utils.hpp @@ -89,6 +89,7 @@ static const char *log_level_to_string(ggml_log_level level) static inline void server_log(ggml_log_level level, const char *function, int line, const char *message, const json &extra) { + std::cout << "DEBUG LOG " << 1 << std::endl; std::stringstream ss_tid; ss_tid << std::this_thread::get_id(); @@ -119,32 +120,43 @@ static inline void server_log(ggml_log_level level, const char *function, int li } else { + std::cout << "DEBUG LOG " << 2 << std::endl; std::stringstream ss; ss << message; + std::cout << "DEBUG LOG " << 3 << std::endl; if (!extra.empty()) { + std::cout << "DEBUG LOG " << 4 << std::endl; for (const auto &el : extra.items()) { const std::string value = el.value().dump(-1, ' ', false, json::error_handler_t::replace); ss << " " << el.key() << "=" << value; } + std::cout << "DEBUG LOG " << 5 << std::endl; } + std::cout << "DEBUG LOG " << 6 << std::endl; #if SERVER_VERBOSE ss << " | ts " << time(nullptr) << " | tid " << ss_tid.str() << " | " << function << " line " << line; #endif + std::cout << "DEBUG LOG " << 7 << std::endl; const std::string str = ss.str(); if (log_callback == nullptr) { + std::cout << "DEBUG LOG " << 8 << std::endl; printf("[%4s] %.*s\n", log_level_to_string(level), (int)str.size(), str.data()); + std::cout << "DEBUG LOG " << 9 << std::endl; } else { + std::cout << "DEBUG 
LOG " << 10 << std::endl; log_callback(level, str.c_str(), nullptr); + std::cout << "DEBUG LOG " << 11 << std::endl; } } + std::cout << "DEBUG LOG " << 12 << std::endl; fflush(stdout); } diff --git a/src/test/java/de/kherud/llama/LlamaModelTest.java b/src/test/java/de/kherud/llama/LlamaModelTest.java index 8fa8baa4..c7ece673 100644 --- a/src/test/java/de/kherud/llama/LlamaModelTest.java +++ b/src/test/java/de/kherud/llama/LlamaModelTest.java @@ -213,7 +213,6 @@ public void testLogStdout() { System.out.println("########## Log Text ##########"); LlamaModel.setLogger(LogFormat.TEXT, null); - System.out.println("DEBUG: Logger set"); model.complete(params); System.out.println("########## Log JSON ##########"); From ed73e3459ae95d5878c4ec0a4d546c014eff08ad Mon Sep 17 00:00:00 2001 From: Konstantin Herud Date: Tue, 6 Aug 2024 19:37:32 +0200 Subject: [PATCH 13/24] update debug statements --- src/main/cpp/jllama.cpp | 15 +++------------ src/main/cpp/server.hpp | 4 ---- src/main/cpp/utils.hpp | 12 ------------ src/main/java/de/kherud/llama/LlamaModel.java | 1 + 4 files changed, 4 insertions(+), 28 deletions(-) diff --git a/src/main/cpp/jllama.cpp b/src/main/cpp/jllama.cpp index 2e94b446..4fffa9bc 100644 --- a/src/main/cpp/jllama.cpp +++ b/src/main/cpp/jllama.cpp @@ -358,6 +358,8 @@ JNIEXPORT void JNICALL Java_de_kherud_llama_LlamaModel_loadModel(JNIEnv *env, jo auto *ctx_server = new server_context(); + std::cout << "New model: " << ctx_server << std::endl; + std::string c_params = parse_jstring(env, jparams); json json_params = json::parse(c_params); server_params_parse(json_params, params); @@ -476,15 +478,12 @@ JNIEXPORT jint JNICALL Java_de_kherud_llama_LlamaModel_requestCompletion(JNIEnv { jlong server_handle = env->GetLongField(obj, f_model_pointer); auto *ctx_server = reinterpret_cast(server_handle); // NOLINT(*-no-int-to-ptr) - - std::cout << "DEBUG " << 1 << std::endl; + std::cout << "Request completion: " << ctx_server << std::endl; std::string c_params = parse_jstring(env, jparams); json json_params = json::parse(c_params); const bool infill = json_params.contains("input_prefix") || json_params.contains("input_suffix"); - std::cout << "DEBUG " << 2 << std::endl; - if (json_params.value("use_chat_template", false)) { json chat; @@ -493,12 +492,8 @@ JNIEXPORT jint JNICALL Java_de_kherud_llama_LlamaModel_requestCompletion(JNIEnv json_params["prompt"] = format_chat(ctx_server->model, ctx_server->params.chat_template, chat); } - std::cout << "DEBUG " << 3 << std::endl; - const int id_task = ctx_server->queue_tasks.get_new_id(); - std::cout << "DEBUG " << 4 << std::endl; ctx_server->queue_results.add_waiting_task_id(id_task); - std::cout << "DEBUG " << 5 << std::endl; ctx_server->request_completion(id_task, -1, json_params, infill, false); return id_task; @@ -509,7 +504,6 @@ JNIEXPORT jobject JNICALL Java_de_kherud_llama_LlamaModel_receiveCompletion(JNIE jlong server_handle = env->GetLongField(obj, f_model_pointer); auto *ctx_server = reinterpret_cast(server_handle); // NOLINT(*-no-int-to-ptr) - std::cout << "DEBUG " << 8 << std::endl; server_task_result result = ctx_server->queue_results.recv(id_task); if (result.error) @@ -519,14 +513,12 @@ JNIEXPORT jobject JNICALL Java_de_kherud_llama_LlamaModel_receiveCompletion(JNIE env->ThrowNew(c_llama_error, response.c_str()); return nullptr; } - std::cout << "DEBUG " << 9 << std::endl; std::string response = result.data["content"].get(); if (result.stop) { ctx_server->queue_results.remove_waiting_task_id(id_task); } - std::cout << "DEBUG " << 10 << 
std::endl; jobject o_probabilities = env->NewObject(c_hash_map, cc_hash_map); if (result.data.contains("completion_probabilities")) @@ -547,7 +539,6 @@ JNIEXPORT jobject JNICALL Java_de_kherud_llama_LlamaModel_receiveCompletion(JNIE } } } - std::cout << "DEBUG " << 11 << std::endl; jbyteArray jbytes = parse_jbytes(env, response); return env->NewObject(c_output, cc_output, jbytes, o_probabilities, result.stop); diff --git a/src/main/cpp/server.hpp b/src/main/cpp/server.hpp index fcbe167b..0601dac4 100644 --- a/src/main/cpp/server.hpp +++ b/src/main/cpp/server.hpp @@ -1661,8 +1661,6 @@ struct server_context task.embedding = embedding; task.type = SERVER_TASK_TYPE_COMPLETION; - std::cout << "DEBUG " << 6 << std::endl; - // when a completion task's prompt array is not a singleton, we split it into multiple requests // otherwise, it's a single-prompt task, we actually queue it // if there's numbers in the prompt array it will be treated as an array of tokens @@ -1696,8 +1694,6 @@ struct server_context { queue_tasks.post(task); } - - std::cout << "DEBUG " << 7 << std::endl; } void request_cancel(int id_task) diff --git a/src/main/cpp/utils.hpp b/src/main/cpp/utils.hpp index 9926ea97..7de7eac4 100644 --- a/src/main/cpp/utils.hpp +++ b/src/main/cpp/utils.hpp @@ -89,7 +89,6 @@ static const char *log_level_to_string(ggml_log_level level) static inline void server_log(ggml_log_level level, const char *function, int line, const char *message, const json &extra) { - std::cout << "DEBUG LOG " << 1 << std::endl; std::stringstream ss_tid; ss_tid << std::this_thread::get_id(); @@ -120,43 +119,32 @@ static inline void server_log(ggml_log_level level, const char *function, int li } else { - std::cout << "DEBUG LOG " << 2 << std::endl; std::stringstream ss; ss << message; - std::cout << "DEBUG LOG " << 3 << std::endl; if (!extra.empty()) { - std::cout << "DEBUG LOG " << 4 << std::endl; for (const auto &el : extra.items()) { const std::string value = el.value().dump(-1, ' ', false, json::error_handler_t::replace); ss << " " << el.key() << "=" << value; } - std::cout << "DEBUG LOG " << 5 << std::endl; } - std::cout << "DEBUG LOG " << 6 << std::endl; #if SERVER_VERBOSE ss << " | ts " << time(nullptr) << " | tid " << ss_tid.str() << " | " << function << " line " << line; #endif - std::cout << "DEBUG LOG " << 7 << std::endl; const std::string str = ss.str(); if (log_callback == nullptr) { - std::cout << "DEBUG LOG " << 8 << std::endl; printf("[%4s] %.*s\n", log_level_to_string(level), (int)str.size(), str.data()); - std::cout << "DEBUG LOG " << 9 << std::endl; } else { - std::cout << "DEBUG LOG " << 10 << std::endl; log_callback(level, str.c_str(), nullptr); - std::cout << "DEBUG LOG " << 11 << std::endl; } } - std::cout << "DEBUG LOG " << 12 << std::endl; fflush(stdout); } diff --git a/src/main/java/de/kherud/llama/LlamaModel.java b/src/main/java/de/kherud/llama/LlamaModel.java index b78e056e..5a34935c 100644 --- a/src/main/java/de/kherud/llama/LlamaModel.java +++ b/src/main/java/de/kherud/llama/LlamaModel.java @@ -42,6 +42,7 @@ public class LlamaModel implements AutoCloseable { */ public LlamaModel(ModelParameters parameters) { loadModel(parameters.toString()); + System.out.println(ctx); } /** From c6fcef46d7c23a6c37b227bf21e8083f018db450 Mon Sep 17 00:00:00 2001 From: Konstantin Herud Date: Tue, 6 Aug 2024 19:37:42 +0200 Subject: [PATCH 14/24] update to llama.cpp b3534 --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 32e9e2ff..1d0e8e98 
100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -23,7 +23,7 @@ FetchContent_MakeAvailable(json) FetchContent_Declare( llama.cpp GIT_REPOSITORY https://github.com/ggerganov/llama.cpp.git - GIT_TAG b3525 + GIT_TAG b3534 ) FetchContent_MakeAvailable(llama.cpp) From 913d201ede8a85299aa72e4ee2fe99ef4820330d Mon Sep 17 00:00:00 2001 From: Konstantin Herud Date: Tue, 6 Aug 2024 20:08:52 +0200 Subject: [PATCH 15/24] update debug output --- .github/workflows/ci.yml | 10 ++++++++++ src/main/cpp/jllama.cpp | 3 --- src/main/java/de/kherud/llama/LlamaModel.java | 1 - 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3fef12a6..60a06ccd 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -27,6 +27,11 @@ jobs: run: curl -L ${MODEL_URL} --create-dirs -o models/${MODEL_NAME} - name: Run tests run: mvn test + - if: failure() + uses: actions/upload-artifact@v3 + with: + path: ${{ github.workspace }}/hs_err_pid*.log + if-no-files-found: warn build-and-test-macos: name: ${{ matrix.target.runner }} @@ -75,3 +80,8 @@ jobs: run: curl -L $env:MODEL_URL --create-dirs -o models/$env:MODEL_NAME - name: Run tests run: mvn test + - if: failure() + uses: actions/upload-artifact@v3 + with: + path: ${{ github.workspace }}\hs_err_pid*.log + if-no-files-found: warn diff --git a/src/main/cpp/jllama.cpp b/src/main/cpp/jllama.cpp index 4fffa9bc..d59f3b77 100644 --- a/src/main/cpp/jllama.cpp +++ b/src/main/cpp/jllama.cpp @@ -358,8 +358,6 @@ JNIEXPORT void JNICALL Java_de_kherud_llama_LlamaModel_loadModel(JNIEnv *env, jo auto *ctx_server = new server_context(); - std::cout << "New model: " << ctx_server << std::endl; - std::string c_params = parse_jstring(env, jparams); json json_params = json::parse(c_params); server_params_parse(json_params, params); @@ -478,7 +476,6 @@ JNIEXPORT jint JNICALL Java_de_kherud_llama_LlamaModel_requestCompletion(JNIEnv { jlong server_handle = env->GetLongField(obj, f_model_pointer); auto *ctx_server = reinterpret_cast(server_handle); // NOLINT(*-no-int-to-ptr) - std::cout << "Request completion: " << ctx_server << std::endl; std::string c_params = parse_jstring(env, jparams); json json_params = json::parse(c_params); diff --git a/src/main/java/de/kherud/llama/LlamaModel.java b/src/main/java/de/kherud/llama/LlamaModel.java index 5a34935c..b78e056e 100644 --- a/src/main/java/de/kherud/llama/LlamaModel.java +++ b/src/main/java/de/kherud/llama/LlamaModel.java @@ -42,7 +42,6 @@ public class LlamaModel implements AutoCloseable { */ public LlamaModel(ModelParameters parameters) { loadModel(parameters.toString()); - System.out.println(ctx); } /** From 08c42561e29db5c90d46bed5c99af4dcb0f462df Mon Sep 17 00:00:00 2001 From: Konstantin Herud Date: Wed, 7 Aug 2024 19:51:21 +0200 Subject: [PATCH 16/24] run model in Java thread for debugging --- src/main/cpp/jllama.cpp | 17 ++--------------- src/main/java/de/kherud/llama/LlamaModel.java | 10 +++++++++- 2 files changed, 11 insertions(+), 16 deletions(-) diff --git a/src/main/cpp/jllama.cpp b/src/main/cpp/jllama.cpp index d59f3b77..03405d5b 100644 --- a/src/main/cpp/jllama.cpp +++ b/src/main/cpp/jllama.cpp @@ -454,22 +454,9 @@ JNIEXPORT void JNICALL Java_de_kherud_llama_LlamaModel_loadModel(JNIEnv *env, jo std::placeholders::_1, std::placeholders::_2, std::placeholders::_3)); - std::thread t([ctx_server]() { - JNIEnv *env; - jint res = g_vm->GetEnv((void **)&env, JNI_VERSION_1_6); - if (res == JNI_EDETACHED) - { - res = g_vm->AttachCurrentThread((void **)&env, nullptr); 
- if (res != JNI_OK) - { - throw std::runtime_error("Failed to attach thread to JVM"); - } - } - ctx_server->queue_tasks.start_loop(); - }); - t.detach(); - env->SetLongField(obj, f_model_pointer, reinterpret_cast(ctx_server)); + + ctx_server->queue_tasks.start_loop(); } JNIEXPORT jint JNICALL Java_de_kherud_llama_LlamaModel_requestCompletion(JNIEnv *env, jobject obj, jstring jparams) diff --git a/src/main/java/de/kherud/llama/LlamaModel.java b/src/main/java/de/kherud/llama/LlamaModel.java index b78e056e..26246d50 100644 --- a/src/main/java/de/kherud/llama/LlamaModel.java +++ b/src/main/java/de/kherud/llama/LlamaModel.java @@ -28,6 +28,7 @@ public class LlamaModel implements AutoCloseable { @Native private long ctx; + private final Thread modelThread; /** * Load with the given {@link ModelParameters}. Make sure to either set @@ -41,7 +42,14 @@ public class LlamaModel implements AutoCloseable { * @throws LlamaException if no model could be loaded from the given file path */ public LlamaModel(ModelParameters parameters) { - loadModel(parameters.toString()); + this.modelThread = new Thread(() -> loadModel(parameters.toString())); + this.modelThread.start(); + try { + Thread.sleep(30000); + } + catch (InterruptedException e) { + throw new RuntimeException(e); + } } /** From 0b6dff5a6aca925f229a8753cc51c4aae952086c Mon Sep 17 00:00:00 2001 From: Konstantin Herud Date: Wed, 7 Aug 2024 20:04:52 +0200 Subject: [PATCH 17/24] windows cmake build in debug mode --- .github/build.bat | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/build.bat b/.github/build.bat index a904405e..89ebaec4 100755 --- a/.github/build.bat +++ b/.github/build.bat @@ -2,6 +2,6 @@ mkdir build cmake -Bbuild %* -cmake --build build --config Release +cmake --build build --config Debug -if errorlevel 1 exit /b %ERRORLEVEL% \ No newline at end of file +if errorlevel 1 exit /b %ERRORLEVEL% From 405a83f64870c402e8e1b51a8bb0b3fd326143fa Mon Sep 17 00:00:00 2001 From: Konstantin Herud Date: Wed, 7 Aug 2024 20:13:12 +0200 Subject: [PATCH 18/24] ci workflow add verbose flag --- .github/workflows/ci.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 60a06ccd..17923928 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -22,7 +22,7 @@ jobs: # cmake should figure out OS and ARCH automatically when running build.sh (but we need mvn compile for it) run: | mvn compile - .github/build.sh + .github/build.sh -DLLAMA_VERBOSE=ON - name: Download model run: curl -L ${MODEL_URL} --create-dirs -o models/${MODEL_NAME} - name: Run tests @@ -42,11 +42,11 @@ jobs: target: - { runner: macos-13, - cmake: '-DLLAMA_METAL=OFF' + cmake: '-DLLAMA_METAL=OFF -DLLAMA_VERBOSE=ON' } - { runner: macos-14, - cmake: '-DLLAMA_METAL_EMBED_LIBRARY=ON -DLLAMA_METAL=OFF' + cmake: '-DLLAMA_METAL_EMBED_LIBRARY=ON -DLLAMA_METAL=OFF -DLLAMA_VERBOSE=ON' } steps: - uses: actions/checkout@v4 @@ -75,7 +75,7 @@ jobs: - name: Build libraries run: | mvn compile - .github\build.bat + .github\build.bat -DLLAMA_VERBOSE=ON - name: Download model run: curl -L $env:MODEL_URL --create-dirs -o models/$env:MODEL_NAME - name: Run tests From 58b9889fd4edbbeb5de9a7bf7fc50216a1d969e6 Mon Sep 17 00:00:00 2001 From: Konstantin Herud Date: Wed, 7 Aug 2024 20:13:26 +0200 Subject: [PATCH 19/24] cmake file update windows debug output --- CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1d0e8e98..43a0c725 100644 --- 
a/CMakeLists.txt +++ b/CMakeLists.txt @@ -98,6 +98,7 @@ target_compile_definitions(jllama PRIVATE if(OS_NAME STREQUAL "Windows") set_target_properties(jllama llama ggml PROPERTIES + RUNTIME_OUTPUT_DIRECTORY_DEBUG ${JLLAMA_DIR} RUNTIME_OUTPUT_DIRECTORY_RELEASE ${JLLAMA_DIR} ) else() From a3ad9fcd491c4b710c4368712790f12a4f7bed5f Mon Sep 17 00:00:00 2001 From: Konstantin Herud Date: Wed, 7 Aug 2024 20:22:24 +0200 Subject: [PATCH 20/24] Revert "run model in Java thread for debugging" This reverts commit 08c42561e29db5c90d46bed5c99af4dcb0f462df. --- src/main/cpp/jllama.cpp | 17 +++++++++++++++-- src/main/java/de/kherud/llama/LlamaModel.java | 10 +--------- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/src/main/cpp/jllama.cpp b/src/main/cpp/jllama.cpp index 03405d5b..d59f3b77 100644 --- a/src/main/cpp/jllama.cpp +++ b/src/main/cpp/jllama.cpp @@ -454,9 +454,22 @@ JNIEXPORT void JNICALL Java_de_kherud_llama_LlamaModel_loadModel(JNIEnv *env, jo std::placeholders::_1, std::placeholders::_2, std::placeholders::_3)); - env->SetLongField(obj, f_model_pointer, reinterpret_cast(ctx_server)); + std::thread t([ctx_server]() { + JNIEnv *env; + jint res = g_vm->GetEnv((void **)&env, JNI_VERSION_1_6); + if (res == JNI_EDETACHED) + { + res = g_vm->AttachCurrentThread((void **)&env, nullptr); + if (res != JNI_OK) + { + throw std::runtime_error("Failed to attach thread to JVM"); + } + } + ctx_server->queue_tasks.start_loop(); + }); + t.detach(); - ctx_server->queue_tasks.start_loop(); + env->SetLongField(obj, f_model_pointer, reinterpret_cast(ctx_server)); } JNIEXPORT jint JNICALL Java_de_kherud_llama_LlamaModel_requestCompletion(JNIEnv *env, jobject obj, jstring jparams) diff --git a/src/main/java/de/kherud/llama/LlamaModel.java b/src/main/java/de/kherud/llama/LlamaModel.java index 26246d50..b78e056e 100644 --- a/src/main/java/de/kherud/llama/LlamaModel.java +++ b/src/main/java/de/kherud/llama/LlamaModel.java @@ -28,7 +28,6 @@ public class LlamaModel implements AutoCloseable { @Native private long ctx; - private final Thread modelThread; /** * Load with the given {@link ModelParameters}. Make sure to either set @@ -42,14 +41,7 @@ public class LlamaModel implements AutoCloseable { * @throws LlamaException if no model could be loaded from the given file path */ public LlamaModel(ModelParameters parameters) { - this.modelThread = new Thread(() -> loadModel(parameters.toString())); - this.modelThread.start(); - try { - Thread.sleep(30000); - } - catch (InterruptedException e) { - throw new RuntimeException(e); - } + loadModel(parameters.toString()); } /** From 45235726f83f145ca86127d7152e25e90eeaa367 Mon Sep 17 00:00:00 2001 From: Konstantin Herud Date: Wed, 7 Aug 2024 20:22:40 +0200 Subject: [PATCH 21/24] Revert "windows cmake build in debug mode" This reverts commit 0b6dff5a6aca925f229a8753cc51c4aae952086c. 
--- .github/build.bat | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/build.bat b/.github/build.bat index 89ebaec4..a904405e 100755 --- a/.github/build.bat +++ b/.github/build.bat @@ -2,6 +2,6 @@ mkdir build cmake -Bbuild %* -cmake --build build --config Debug +cmake --build build --config Release -if errorlevel 1 exit /b %ERRORLEVEL% +if errorlevel 1 exit /b %ERRORLEVEL% \ No newline at end of file From 7ed0dbe4d7916ee259ff2fd2631a01b6a1ea0746 Mon Sep 17 00:00:00 2001 From: Konstantin Herud Date: Wed, 7 Aug 2024 20:32:53 +0200 Subject: [PATCH 22/24] ignore log stdout test --- src/test/java/de/kherud/llama/LlamaModelTest.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/test/java/de/kherud/llama/LlamaModelTest.java b/src/test/java/de/kherud/llama/LlamaModelTest.java index c7ece673..01f98b79 100644 --- a/src/test/java/de/kherud/llama/LlamaModelTest.java +++ b/src/test/java/de/kherud/llama/LlamaModelTest.java @@ -8,6 +8,7 @@ import org.junit.AfterClass; import org.junit.Assert; import org.junit.BeforeClass; +import org.junit.Ignore; import org.junit.Test; public class LlamaModelTest { @@ -204,6 +205,7 @@ public void testLogJSON() { } } + @Ignore @Test public void testLogStdout() { // Unfortunately, `printf` can't be easily re-directed to Java. This test only works manually, thus. From 458de579d3a624c368827153673f85c3bba4d634 Mon Sep 17 00:00:00 2001 From: Konstantin Herud Date: Wed, 7 Aug 2024 20:47:32 +0200 Subject: [PATCH 23/24] Revert "ignore log stdout test" This reverts commit 7ed0dbe4d7916ee259ff2fd2631a01b6a1ea0746. --- src/test/java/de/kherud/llama/LlamaModelTest.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/test/java/de/kherud/llama/LlamaModelTest.java b/src/test/java/de/kherud/llama/LlamaModelTest.java index 01f98b79..c7ece673 100644 --- a/src/test/java/de/kherud/llama/LlamaModelTest.java +++ b/src/test/java/de/kherud/llama/LlamaModelTest.java @@ -8,7 +8,6 @@ import org.junit.AfterClass; import org.junit.Assert; import org.junit.BeforeClass; -import org.junit.Ignore; import org.junit.Test; public class LlamaModelTest { @@ -205,7 +204,6 @@ public void testLogJSON() { } } - @Ignore @Test public void testLogStdout() { // Unfortunately, `printf` can't be easily re-directed to Java. This test only works manually, thus. 
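Note on the testLogStdout back-and-forth in [PATCH 22/24] and [PATCH 23/24]: the native logger in utils.hpp either forwards each message to a registered Java callback or falls back to printf(), and only the callback path can be asserted from a JUnit test, because printf() writes to the process stdout and bypasses System.out. What follows is a minimal, hypothetical Java sketch of that callback path, not part of the patch series; the (level, message) callback shape, setModelFilePath(...), and the InferenceParameters usage are assumptions pieced together from the API fragments visible in the diffs, not confirmed signatures.

    import java.util.ArrayList;
    import java.util.List;

    import de.kherud.llama.InferenceParameters;
    import de.kherud.llama.LlamaModel;
    import de.kherud.llama.LogFormat;
    import de.kherud.llama.ModelParameters;

    public class LogCaptureSketch {
        public static void main(String[] args) {
            List<String> messages = new ArrayList<>();
            // Route native log output through a Java callback instead of printf().
            // The two-argument (level, message) callback is an assumption; only
            // setLogger(LogFormat, ...) itself appears in the diffs above.
            LlamaModel.setLogger(LogFormat.JSON, (level, message) -> messages.add(message));

            // "models/model.gguf" is a placeholder path and setModelFilePath(...) an
            // assumed setter; substitute whatever the tests in this repository use.
            ModelParameters modelParams = new ModelParameters().setModelFilePath("models/model.gguf");
            try (LlamaModel model = new LlamaModel(modelParams)) {
                model.complete(new InferenceParameters("Hello").setNPredict(8));
            }

            // Restore the default: with a null callback the native side prints via
            // printf(), which cannot be captured from Java, hence the manual-only test.
            LlamaModel.setLogger(LogFormat.TEXT, null);
            System.out.println("captured " + messages.size() + " log messages");
        }
    }

Messages collected through the callback can be asserted on (as testLogJSON already does), while the stdout variant remains a manual check, as the comment kept in testLogStdout notes.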
From 18a0e8b8feef60260e3caf4635adf81455c0b008 Mon Sep 17 00:00:00 2001 From: Konstantin Herud Date: Wed, 7 Aug 2024 20:51:22 +0200 Subject: [PATCH 24/24] release workflow enable macos test, disable windows --- .github/workflows/release.yaml | 37 +++++++++++++++++----------------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index fc88d112..7d01ef41 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -137,25 +137,24 @@ jobs: - name: Run tests run: mvn test - # disabled for now, we don't have access to a macos arm64 runner and testing on x86_64 doesn't work -# test-macos: -# name: Test Mac -# needs: build-macos-native -# runs-on: macos-latest -# steps: -# - uses: actions/checkout@v4 -# - uses: actions/download-artifact@v3 -# with: -# name: artifacts -# path: ${{ github.workspace }}/src/main/resources/de/kherud/llama/ -# - name: Download model -# run: curl -L ${MODEL_URL} --create-dirs -o models/${MODEL_NAME} -# - uses: actions/setup-java@v4 -# with: -# distribution: 'zulu' -# java-version: '11' -# - name: Run tests -# run: mvn test + test-macos: + name: Test Mac + needs: build-macos-native + runs-on: macos-14 + steps: + - uses: actions/checkout@v4 + - uses: actions/download-artifact@v3 + with: + name: artifacts + path: ${{ github.workspace }}/src/main/resources/de/kherud/llama/ + - name: Download model + run: curl -L ${MODEL_URL} --create-dirs -o models/${MODEL_NAME} + - uses: actions/setup-java@v4 + with: + distribution: 'zulu' + java-version: '11' + - name: Run tests + run: mvn test test-windows: