From a90a5e8687801a7cdcd797eb6f9e8c140df91d76 Mon Sep 17 00:00:00 2001 From: vansangpfiev Date: Fri, 16 May 2025 07:11:48 +0700 Subject: [PATCH 1/8] fix: check model is loaded before starting (#2206) * fix: check model is loaded before starting * chore: e2e test --------- Co-authored-by: sangjanai --- engine/config/yaml_config.cc | 2 +- .../cli/engines/test_cli_engine_uninstall.py | 5 +++- engine/e2e-test/cli/model/test_cli_model.py | 1 + .../extensions/local-engine/local_engine.cc | 29 ++++++++++++++++--- engine/services/model_source_service.cc | 3 +- 5 files changed, 32 insertions(+), 8 deletions(-) diff --git a/engine/config/yaml_config.cc b/engine/config/yaml_config.cc index 9650ffdcc..38128e1c4 100644 --- a/engine/config/yaml_config.cc +++ b/engine/config/yaml_config.cc @@ -48,7 +48,7 @@ void YamlHandler::ReadYamlFile(const std::string& file_path) { if (!yaml_node_["mmproj"]) { auto s = nomalize_path(file_path); auto abs_path = s.substr(0, s.find_last_of('/')) + "/mmproj.gguf"; - CTL_DBG("mmproj: " << abs_path); + CTL_TRC("mmproj: " << abs_path); auto rel_path = fmu::ToRelativeCortexDataPath(fs::path(abs_path)); if (std::filesystem::exists(abs_path)) { yaml_node_["mmproj"] = rel_path.string(); diff --git a/engine/e2e-test/cli/engines/test_cli_engine_uninstall.py b/engine/e2e-test/cli/engines/test_cli_engine_uninstall.py index 8672110e2..3198c81a5 100644 --- a/engine/e2e-test/cli/engines/test_cli_engine_uninstall.py +++ b/engine/e2e-test/cli/engines/test_cli_engine_uninstall.py @@ -24,7 +24,10 @@ def setup_and_teardown(self): @pytest.mark.asyncio async def test_engines_uninstall_llamacpp_should_be_successfully(self): - response = requests.post("http://localhost:3928/v1/engines/llama-cpp/install") + data = {"version": "b5371"} + response = requests.post( + "http://localhost:3928/v1/engines/llama-cpp/install", json=data + ) await wait_for_websocket_download_success_event(timeout=None) exit_code, output, error = run( "Uninstall engine", ["engines", "uninstall", "llama-cpp"] diff --git a/engine/e2e-test/cli/model/test_cli_model.py b/engine/e2e-test/cli/model/test_cli_model.py index aa6e99e4a..cd80a9e2b 100644 --- a/engine/e2e-test/cli/model/test_cli_model.py +++ b/engine/e2e-test/cli/model/test_cli_model.py @@ -36,6 +36,7 @@ def setup_and_teardown(self): run("Delete model", ["models", "delete", "tinyllama:1b"]) stop_server() + @pytest.mark.skipif(platform.system() == "Windows", reason="Skip test for Windows") def test_model_pull_with_direct_url_should_be_success(self): exit_code, output, error = run( "Pull model", diff --git a/engine/extensions/local-engine/local_engine.cc b/engine/extensions/local-engine/local_engine.cc index b769c5e8c..2bba11a7b 100644 --- a/engine/extensions/local-engine/local_engine.cc +++ b/engine/extensions/local-engine/local_engine.cc @@ -80,6 +80,11 @@ std::vector ConvertJsonToParamsVector(const Json::Value& root) { res.push_back("--no-mmap"); } continue; + } else if (member == "ignore_eos") { + if (root[member].asBool()) { + res.push_back("--ignore_eos"); + } + continue; } res.push_back("--" + member); @@ -502,6 +507,23 @@ void LocalEngine::HandleEmbedding(std::shared_ptr json_body, void LocalEngine::LoadModel(std::shared_ptr json_body, http_callback&& callback) { + auto model_id = json_body->get("model", "").asString(); + if (model_id.empty()) { + CTL_WRN("Model is empty"); + } + if (server_map_.find(model_id) != server_map_.end()) { + CTL_INF("Model " << model_id << " is already loaded"); + Json::Value error; + error["error"] = "Model " + model_id + " is already 
loaded"; + Json::Value status; + status["is_done"] = true; + status["has_error"] = true; + status["is_stream"] = false; + status["status_code"] = 409; + callback(std::move(status), std::move(error)); + return; + } + CTL_INF("Start loading model"); auto wait_for_server_up = [this](const std::string& model, const std::string& host, int port) { @@ -524,10 +546,7 @@ void LocalEngine::LoadModel(std::shared_ptr json_body, }; LOG_DEBUG << "Start to spawn llama-server"; - auto model_id = json_body->get("model", "").asString(); - if (model_id.empty()) { - CTL_WRN("Model is empty"); - } + server_map_[model_id].host = "127.0.0.1"; server_map_[model_id].port = GenerateRandomInteger(39400, 39999); auto& s = server_map_[model_id]; @@ -545,6 +564,8 @@ void LocalEngine::LoadModel(std::shared_ptr json_body, params.push_back("--pooling"); params.push_back("mean"); + params.push_back("--jinja"); + std::vector v; v.reserve(params.size() + 1); auto engine_dir = engine_service_.GetEngineDirPath(kLlamaRepo); diff --git a/engine/services/model_source_service.cc b/engine/services/model_source_service.cc index b5979667c..661b9b580 100644 --- a/engine/services/model_source_service.cc +++ b/engine/services/model_source_service.cc @@ -433,8 +433,7 @@ cpp::result ModelSourceService::AddCortexsoRepo( auto author = hub_author; auto model_author = hu::GetModelAuthorCortexsoHub(model_name); - if (auto model_author = hu::GetModelAuthorCortexsoHub(model_name); - model_author.has_value() && !model_author.value().empty()) { + if (model_author.has_value() && !model_author.value().empty()) { author = model_author.value(); } From 3a638267c26016d25b53ef936325b0b765487f1f Mon Sep 17 00:00:00 2001 From: Akarshan Biswas Date: Thu, 12 Jun 2025 13:47:26 +0530 Subject: [PATCH 2/8] Hostfix: remove not needed params from load_model (#2209) * refactor: remove --pooling flag from model loading The --pooling flag was removed as the mean pooling functionality not needed in chat models. This fixes the regression * feat(local-engine): add ctx_len parameter support Adds support for the ctx_len parameter by appending --ctx-size with its value. Removed outdated parameter mappings from the kParamsMap to reflect current implementation details and ensure consistency. * feat: add conditional model parameters based on path When the model path contains both "jan" and "nano" (case-insensitive), automatically add speculative decoding parameters to adjust generation behavior. This improves flexibility by enabling environment-specific configurations without manual parameter tuning. Also includes necessary headers for string manipulation and fixes whitespace in ctx_len handling. * chore: remove redundant comment The comment was redundant as the code's purpose is clear without it, improving readability. 
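
As a rough illustration of the ctx_len mapping described above (this is not code from the patch: the function `ToServerFlags` and the reduced `kMap` table are hypothetical stand-ins for the engine's `ConvertJsonToParamsVector` and `kParamsMap` in local_engine.cc), a minimal self-contained sketch:

```cpp
// Minimal sketch only; assumes jsoncpp. ToServerFlags and kMap are illustrative
// stand-ins for ConvertJsonToParamsVector and kParamsMap in local_engine.cc.
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

#include <json/json.h>

std::vector<std::string> ToServerFlags(const Json::Value& body) {
  // Reduced mapping table: request field -> llama-server flag.
  static const std::unordered_map<std::string, std::string> kMap = {
      {"ctx_len", "--ctx-size"},
      {"ngl", "-ngl"},
  };
  std::vector<std::string> flags;
  for (const auto& key : body.getMemberNames()) {
    auto it = kMap.find(key);
    if (it == kMap.end() || body[key].isNull()) {
      continue;  // unknown or empty fields are skipped
    }
    flags.push_back(it->second);
    flags.push_back(body[key].asString());  // jsoncpp stringifies numeric values
  }
  return flags;
}

int main() {
  Json::Value req;
  req["ctx_len"] = 4096;
  req["ngl"] = 33;
  for (const auto& flag : ToServerFlags(req)) {
    std::cout << flag << ' ';
  }
  std::cout << '\n';  // prints: --ctx-size 4096 -ngl 33
}
```

Keeping the lookup in a single table means a new request field usually only needs one table entry rather than another branch in the conversion loop.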
--- .../extensions/local-engine/local_engine.cc | 28 ++++++++++++++---- engine/services/model_service.cc | 29 ++++++++++++------- 2 files changed, 40 insertions(+), 17 deletions(-) diff --git a/engine/extensions/local-engine/local_engine.cc b/engine/extensions/local-engine/local_engine.cc index 2bba11a7b..beda1f44b 100644 --- a/engine/extensions/local-engine/local_engine.cc +++ b/engine/extensions/local-engine/local_engine.cc @@ -1,6 +1,9 @@ #include "local_engine.h" +#include #include +#include #include +#include #include #include "utils/curl_utils.h" #include "utils/json_helper.h" @@ -20,6 +23,7 @@ const std::unordered_set kIgnoredParams = { "user_prompt", "min_keep", "mirostat", "mirostat_eta", "mirostat_tau", "text_model", "version", "n_probs", "object", "penalize_nl", "precision", "size", + "flash_attn", "stop", "tfs_z", "typ_p", "caching_enabled"}; const std::unordered_map kParamsMap = { @@ -42,18 +46,24 @@ int GenerateRandomInteger(int min, int max) { std::uniform_int_distribution<> dis( min, max); // Distribution for the desired range - return dis(gen); // Generate and return a random integer within the range + return dis(gen); } std::vector ConvertJsonToParamsVector(const Json::Value& root) { std::vector res; - std::string errors; for (const auto& member : root.getMemberNames()) { if (member == "model_path" || member == "llama_model_path") { if (!root[member].isNull()) { + const std::string path = root[member].asString(); res.push_back("--model"); - res.push_back(root[member].asString()); + res.push_back(path); + + // If path contains both "Jan" and "nano", case-insensitive, add special params + std::string lowered = path; + std::transform(lowered.begin(), lowered.end(), lowered.begin(), [](unsigned char c) { + return std::tolower(c); + }); } continue; } else if (kIgnoredParams.find(member) != kIgnoredParams.end()) { @@ -85,8 +95,15 @@ std::vector ConvertJsonToParamsVector(const Json::Value& root) { res.push_back("--ignore_eos"); } continue; + } else if (member == "ctx_len") { + if (!root[member].isNull()) { + res.push_back("--ctx-size"); + res.push_back(root[member].asString()); + } + continue; } + // Generic handling for other members res.push_back("--" + member); if (root[member].isString()) { res.push_back(root[member].asString()); @@ -105,7 +122,7 @@ std::vector ConvertJsonToParamsVector(const Json::Value& root) { ss << "\"" << value.asString() << "\""; first = false; } - ss << "] "; + ss << "]"; res.push_back(ss.str()); } } @@ -113,6 +130,7 @@ std::vector ConvertJsonToParamsVector(const Json::Value& root) { return res; } + constexpr const auto kMinDataChunkSize = 6u; struct OaiInfo { @@ -561,8 +579,6 @@ void LocalEngine::LoadModel(std::shared_ptr json_body, params.push_back("--port"); params.push_back(std::to_string(s.port)); - params.push_back("--pooling"); - params.push_back("mean"); params.push_back("--jinja"); diff --git a/engine/services/model_service.cc b/engine/services/model_service.cc index 68f0fe070..2da6c749e 100644 --- a/engine/services/model_service.cc +++ b/engine/services/model_service.cc @@ -165,8 +165,8 @@ ModelService::ModelService(std::shared_ptr db_service, download_service_{download_service}, inference_svc_(inference_service), engine_svc_(engine_svc), - task_queue_(task_queue) { - // ProcessBgrTasks(); + task_queue_(task_queue){ + // ProcessBgrTasks(); }; void ModelService::ForceIndexingModelList() { @@ -557,6 +557,8 @@ cpp::result ModelService::StartModel( if (auto& o = params_override["ctx_len"]; !o.isNull()) { ctx_len = o.asInt(); } + Json::Value 
model_load_params; + json_helper::MergeJson(model_load_params, params_override); try { constexpr const int kDefautlContextLength = 8192; @@ -630,6 +632,8 @@ cpp::result ModelService::StartModel( #else json_data["model_path"] = fmu::ToAbsoluteCortexDataPath(fs::path(mc.files[0])).string(); + model_load_params["model_path"] = + fmu::ToAbsoluteCortexDataPath(fs::path(mc.files[0])).string(); #endif } else { LOG_WARN << "model_path is empty"; @@ -642,6 +646,8 @@ cpp::result ModelService::StartModel( #else json_data["mmproj"] = fmu::ToAbsoluteCortexDataPath(fs::path(mc.mmproj)).string(); + model_load_params["model_path"] = + fmu::ToAbsoluteCortexDataPath(fs::path(mc.mmproj)).string(); #endif } json_data["system_prompt"] = mc.system_template; @@ -655,6 +661,7 @@ cpp::result ModelService::StartModel( } json_data["model"] = model_handle; + model_load_params["model"] = model_handle; if (auto& cpt = custom_prompt_template; !cpt.value_or("").empty()) { auto parse_prompt_result = string_utils::ParsePrompt(cpt.value()); json_data["system_prompt"] = parse_prompt_result.system_prompt; @@ -662,8 +669,6 @@ cpp::result ModelService::StartModel( json_data["ai_prompt"] = parse_prompt_result.ai_prompt; } - json_helper::MergeJson(json_data, params_override); - // Set default cpu_threads if it is not configured if (!json_data.isMember("cpu_threads")) { json_data["cpu_threads"] = GetCpuThreads(); @@ -686,12 +691,12 @@ cpp::result ModelService::StartModel( assert(!!inference_svc_); - auto ir = - inference_svc_->LoadModel(std::make_shared(json_data)); + auto ir = inference_svc_->LoadModel( + std::make_shared(model_load_params)); auto status = std::get<0>(ir)["status_code"].asInt(); auto data = std::get<1>(ir); - if (status == drogon::k200OK) { + if (status == drogon::k200OK) { return StartModelResult{/* .success = */ true, /* .warning = */ may_fallback_res.value()}; } else if (status == drogon::k409Conflict) { @@ -1031,13 +1036,15 @@ ModelService::MayFallbackToCpu(const std::string& model_path, int ngl, auto es = hardware::EstimateLLaMACppRun(model_path, rc); if (!!es && (*es).gpu_mode.vram_MiB > free_vram_MiB && is_cuda) { - CTL_WRN("Not enough VRAM - " << "required: " << (*es).gpu_mode.vram_MiB - << ", available: " << free_vram_MiB); + CTL_WRN("Not enough VRAM - " + << "required: " << (*es).gpu_mode.vram_MiB + << ", available: " << free_vram_MiB); } if (!!es && (*es).cpu_mode.ram_MiB > free_ram_MiB) { - CTL_WRN("Not enough RAM - " << "required: " << (*es).cpu_mode.ram_MiB - << ", available: " << free_ram_MiB); + CTL_WRN("Not enough RAM - " + << "required: " << (*es).cpu_mode.ram_MiB + << ", available: " << free_ram_MiB); } return warning; From 9e87efca30e017c052c74e574bb998c9b0692a7a Mon Sep 17 00:00:00 2001 From: Louis Date: Thu, 12 Jun 2025 19:51:52 +0700 Subject: [PATCH 3/8] fix: do not ignore client request param (#2210) --- engine/extensions/local-engine/local_engine.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/engine/extensions/local-engine/local_engine.cc b/engine/extensions/local-engine/local_engine.cc index beda1f44b..adc8649f6 100644 --- a/engine/extensions/local-engine/local_engine.cc +++ b/engine/extensions/local-engine/local_engine.cc @@ -23,7 +23,6 @@ const std::unordered_set kIgnoredParams = { "user_prompt", "min_keep", "mirostat", "mirostat_eta", "mirostat_tau", "text_model", "version", "n_probs", "object", "penalize_nl", "precision", "size", - "flash_attn", "stop", "tfs_z", "typ_p", "caching_enabled"}; const std::unordered_map kParamsMap = { From aab35862051ec45239e69060e4c150374a7d2bf9 Mon 
Sep 17 00:00:00 2001 From: Akarshan Biswas Date: Sat, 14 Jun 2025 11:03:14 +0530 Subject: [PATCH 4/8] feat: add reasoning_budget parameter to params map (#2211) --- engine/extensions/local-engine/local_engine.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/engine/extensions/local-engine/local_engine.cc b/engine/extensions/local-engine/local_engine.cc index adc8649f6..74bf0d1b8 100644 --- a/engine/extensions/local-engine/local_engine.cc +++ b/engine/extensions/local-engine/local_engine.cc @@ -37,6 +37,7 @@ const std::unordered_map kParamsMap = { {"dynatemp_exponent", "--dynatemp-exp"}, {"ctx_len", "--ctx-size"}, {"ngl", "-ngl"}, + {"reasoning_budget", "--reasoning-budget"}, }; int GenerateRandomInteger(int min, int max) { @@ -50,6 +51,8 @@ int GenerateRandomInteger(int min, int max) { std::vector ConvertJsonToParamsVector(const Json::Value& root) { std::vector res; + std::string errors; + res.push_back("--no-webui"); for (const auto& member : root.getMemberNames()) { if (member == "model_path" || member == "llama_model_path") { From 62b74b7af9f94e23d4ffd10d8a185d0efb8abb12 Mon Sep 17 00:00:00 2001 From: ethanova Date: Sat, 14 Jun 2025 02:29:32 -0400 Subject: [PATCH 5/8] fix bug where for local models, delete only the model passed in, not all local models (#2207) Co-authored-by: Ethan Garber Co-authored-by: Akarshan Biswas --- engine/services/model_service.cc | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/engine/services/model_service.cc b/engine/services/model_service.cc index 2da6c749e..51e42ff81 100644 --- a/engine/services/model_service.cc +++ b/engine/services/model_service.cc @@ -500,13 +500,10 @@ cpp::result ModelService::DeleteModel( std::filesystem::remove(yaml_fp); CTL_INF("Removed: " << yaml_fp.string()); } else { - // Remove yaml files - for (const auto& entry : - std::filesystem::directory_iterator(yaml_fp.parent_path())) { - if (entry.is_regular_file() && (entry.path().extension() == ".yml")) { - std::filesystem::remove(entry); - CTL_INF("Removed: " << entry.path().string()); - } + // Is a local model - Remove only this model's yaml file + if (std::filesystem::exists(yaml_fp)) { + std::filesystem::remove(yaml_fp); + CTL_INF("Removed: " << yaml_fp.string()); } } From 4cc2166204ef2c3bddb187c9ea3055caca615c9e Mon Sep 17 00:00:00 2001 From: Louis Date: Sun, 15 Jun 2025 16:44:09 +0700 Subject: [PATCH 6/8] fix: model lookup issue on Windows (#2213) --- engine/services/model_service.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/engine/services/model_service.cc b/engine/services/model_service.cc index 51e42ff81..a3771e0a1 100644 --- a/engine/services/model_service.cc +++ b/engine/services/model_service.cc @@ -626,6 +626,9 @@ cpp::result ModelService::StartModel( #if defined(_WIN32) json_data["model_path"] = cortex::wc::WstringToUtf8( fmu::ToAbsoluteCortexDataPath(fs::path(mc.files[0])).wstring()); + model_load_params["model_path"] = + cortex::wc::WstringToUtf8( + fmu::ToAbsoluteCortexDataPath(fs::path(mc.files[0])).wstring()); #else json_data["model_path"] = fmu::ToAbsoluteCortexDataPath(fs::path(mc.files[0])).string(); From cc390f26400ce80732fa81fc6f20ae23a1f88795 Mon Sep 17 00:00:00 2001 From: Service Account Date: Fri, 4 Jul 2025 14:33:36 +0700 Subject: [PATCH 7/8] chore: update readme --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index 5cd51ece1..ad4a379de 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,8 @@ +> ⚠️ **Archived Notice** +> This repository is no longer actively maintained. 
+> We have migrated development to [menloresearch/llama.cpp](https://github.com/menloresearch/llama.cpp). +> Please contribute directly to `llama.cpp` moving forward. + # Cortex

From ee9e0ad5b55142959e6fd78ccbb273b993a2ade0 Mon Sep 17 00:00:00 2001 From: Service Account Date: Fri, 4 Jul 2025 14:37:57 +0700 Subject: [PATCH 8/8] chore: update readme --- README.md | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index ad4a379de..f56842d29 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,9 @@ -> ⚠️ **Archived Notice** -> This repository is no longer actively maintained. -> We have migrated development to [menloresearch/llama.cpp](https://github.com/menloresearch/llama.cpp). -> Please contribute directly to `llama.cpp` moving forward. +

+🚨 Archived Repository Notice
+This repository is no longer actively maintained.
+Development has moved to [menloresearch/llama.cpp](https://github.com/menloresearch/llama.cpp).
+Please contribute directly to llama.cpp moving forward.
+
# Cortex