diff --git a/README.md b/README.md
index 5cd51ece1..f56842d29 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,10 @@
+🚨 Archived Repository Notice
+
+This repository is no longer actively maintained.
+
+Development has moved to menloresearch/llama.cpp.
+
+Please contribute directly to llama.cpp moving forward.
# Cortex
diff --git a/engine/extensions/local-engine/local_engine.cc b/engine/extensions/local-engine/local_engine.cc
index 2bba11a7b..74bf0d1b8 100644
--- a/engine/extensions/local-engine/local_engine.cc
+++ b/engine/extensions/local-engine/local_engine.cc
@@ -1,6 +1,9 @@
#include "local_engine.h"
+#include
#include
+#include
#include
+#include
#include
#include "utils/curl_utils.h"
#include "utils/json_helper.h"
@@ -34,6 +37,7 @@ const std::unordered_map<std::string, std::string> kParamsMap = {
{"dynatemp_exponent", "--dynatemp-exp"},
{"ctx_len", "--ctx-size"},
{"ngl", "-ngl"},
+ {"reasoning_budget", "--reasoning-budget"},
};
int GenerateRandomInteger(int min, int max) {
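kParamsMap translates OpenAI-style request fields into llama.cpp server flags, so the new entry lets callers pass reasoning_budget and have it forwarded as --reasoning-budget. A minimal sketch of that lookup with an illustrative consuming loop (the engine's real loop is ConvertJsonToParamsVector below; the loop here is not the actual code path):

    // Sketch: rename request fields into llama.cpp server flags via the map.
    // Map entries mirror the hunk above; request values are examples.
    #include <iostream>
    #include <string>
    #include <unordered_map>
    #include <utility>
    #include <vector>

    int main() {
      const std::unordered_map<std::string, std::string> params_map = {
          {"ctx_len", "--ctx-size"},
          {"ngl", "-ngl"},
          {"reasoning_budget", "--reasoning-budget"},  // new mapping
      };
      const std::vector<std::pair<std::string, std::string>> request = {
          {"ctx_len", "8192"}, {"reasoning_budget", "0"}};
      std::vector<std::string> args;
      for (const auto& [key, value] : request) {
        auto it = params_map.find(key);
        // Fall back to "--<member>" when the key has no explicit mapping.
        args.push_back(it != params_map.end() ? it->second : "--" + key);
        args.push_back(value);
      }
      for (const auto& a : args)
        std::cout << a << ' ';  // --ctx-size 8192 --reasoning-budget 0
      return 0;
    }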
@@ -42,18 +46,26 @@ int GenerateRandomInteger(int min, int max) {
std::uniform_int_distribution<> dis(
min, max); // Distribution for the desired range
- return dis(gen); // Generate and return a random integer within the range
+ return dis(gen);
}
std::vector<std::string> ConvertJsonToParamsVector(const Json::Value& root) {
std::vector<std::string> res;
std::string errors;
+ res.push_back("--no-webui");
for (const auto& member : root.getMemberNames()) {
if (member == "model_path" || member == "llama_model_path") {
if (!root[member].isNull()) {
+ const std::string path = root[member].asString();
res.push_back("--model");
- res.push_back(root[member].asString());
+ res.push_back(path);
+
+ // If path contains both "Jan" and "nano", case-insensitive, add special params
+ std::string lowered = path;
+ std::transform(lowered.begin(), lowered.end(), lowered.begin(), [](unsigned char c) {
+ return std::tolower(c);
+ });
}
continue;
} else if (kIgnoredParams.find(member) != kIgnoredParams.end()) {
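The hunk above lowercases the model path, but the comparison its comment promises is not visible here, so lowered is left unused in this hunk. A hedged sketch of the check the comment describes; treating --reasoning-budget (registered in kParamsMap above) with a value of 0 as the "special params" is an assumption, not something this diff confirms:

    #include <algorithm>
    #include <cctype>
    #include <string>
    #include <vector>

    // Case-insensitive test matching the comment: the path must contain
    // both "Jan" and "nano".
    bool IsJanNanoModel(const std::string& path) {
      std::string lowered = path;
      std::transform(lowered.begin(), lowered.end(), lowered.begin(),
                     [](unsigned char c) { return std::tolower(c); });
      return lowered.find("jan") != std::string::npos &&
             lowered.find("nano") != std::string::npos;
    }

    // Assumed "special params": the flag and value below are illustrative
    // guesses, not confirmed by the diff.
    void AppendJanNanoParams(std::vector<std::string>& res,
                             const std::string& path) {
      if (IsJanNanoModel(path)) {
        res.push_back("--reasoning-budget");  // assumed flag
        res.push_back("0");                   // assumed value
      }
    }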
@@ -85,8 +97,15 @@ std::vector<std::string> ConvertJsonToParamsVector(const Json::Value& root) {
res.push_back("--ignore_eos");
}
continue;
+ } else if (member == "ctx_len") {
+ if (!root[member].isNull()) {
+ res.push_back("--ctx-size");
+ res.push_back(root[member].asString());
+ }
+ continue;
}
+ // Generic handling for other members
res.push_back("--" + member);
if (root[member].isString()) {
res.push_back(root[member].asString());
@@ -105,7 +124,7 @@ std::vector<std::string> ConvertJsonToParamsVector(const Json::Value& root) {
ss << "\"" << value.asString() << "\"";
first = false;
}
- ss << "] ";
+ ss << "]";
res.push_back(ss.str());
}
}
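With the ctx_len special case in place, ctx_len is emitted as --ctx-size before the generic fallback can produce a nonexistent --ctx_len flag, and array values are serialized without the old trailing space. An illustrative call, assuming jsoncpp's Json::Value API:

    #include <json/json.h>
    #include <string>
    #include <vector>

    std::vector<std::string> ConvertJsonToParamsVector(const Json::Value& root);

    // Illustrative input/output; the exact flag set depends on kParamsMap
    // and kIgnoredParams, and member order follows Json::Value iteration.
    void Example() {
      Json::Value body;
      body["model_path"] = "/models/jan-nano-4b.gguf";  // example path
      body["ctx_len"] = 4096;
      auto args = ConvertJsonToParamsVector(body);
      // args contains "--no-webui", plus "--ctx-size" "4096" and
      // "--model" "/models/jan-nano-4b.gguf".
    }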
@@ -113,6 +132,7 @@ std::vector<std::string> ConvertJsonToParamsVector(const Json::Value& root) {
return res;
}
+
constexpr const auto kMinDataChunkSize = 6u;
struct OaiInfo {
@@ -561,8 +581,6 @@ void LocalEngine::LoadModel(std::shared_ptr<Json::Value> json_body,
params.push_back("--port");
params.push_back(std::to_string(s.port));
- params.push_back("--pooling");
- params.push_back("mean");
params.push_back("--jinja");
diff --git a/engine/services/model_service.cc b/engine/services/model_service.cc
index 68f0fe070..a3771e0a1 100644
--- a/engine/services/model_service.cc
+++ b/engine/services/model_service.cc
@@ -165,8 +165,8 @@ ModelService::ModelService(std::shared_ptr<DatabaseService> db_service,
download_service_{download_service},
inference_svc_(inference_service),
engine_svc_(engine_svc),
- task_queue_(task_queue) {
- // ProcessBgrTasks();
+ task_queue_(task_queue){
+ // ProcessBgrTasks();
};
void ModelService::ForceIndexingModelList() {
@@ -500,13 +500,10 @@ cpp::result<void, std::string> ModelService::DeleteModel(
std::filesystem::remove(yaml_fp);
CTL_INF("Removed: " << yaml_fp.string());
} else {
- // Remove yaml files
- for (const auto& entry :
- std::filesystem::directory_iterator(yaml_fp.parent_path())) {
- if (entry.is_regular_file() && (entry.path().extension() == ".yml")) {
- std::filesystem::remove(entry);
- CTL_INF("Removed: " << entry.path().string());
- }
+ // Is a local model - Remove only this model's yaml file
+ if (std::filesystem::exists(yaml_fp)) {
+ std::filesystem::remove(yaml_fp);
+ CTL_INF("Removed: " << yaml_fp.string());
}
}
@@ -557,6 +554,8 @@ cpp::result<StartModelResult, std::string> ModelService::StartModel(
if (auto& o = params_override["ctx_len"]; !o.isNull()) {
ctx_len = o.asInt();
}
+ Json::Value model_load_params;
+ json_helper::MergeJson(model_load_params, params_override);
try {
constexpr const int kDefautlContextLength = 8192;
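model_load_params is seeded from the caller's params_override up front, and the later hunks copy model_path, mmproj, and model into it explicitly; this replaces the old approach of merging params_override into the fully populated json_data at the end. A rough sketch of the merge semantics this relies on (the real helper is json_helper::MergeJson in utils/json_helper.h; this assumed-equivalent exists only to make the data flow concrete):

    #include <json/json.h>

    // Assumed behavior: copy every member of src into dst, recursing into
    // nested objects, with src winning on conflicts.
    void MergeJsonSketch(Json::Value& dst, const Json::Value& src) {
      for (const auto& key : src.getMemberNames()) {
        if (src[key].isObject() && dst.isMember(key) && dst[key].isObject()) {
          MergeJsonSketch(dst[key], src[key]);
        } else {
          dst[key] = src[key];
        }
      }
    }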
@@ -627,9 +626,14 @@ cpp::result<StartModelResult, std::string> ModelService::StartModel(
#if defined(_WIN32)
json_data["model_path"] = cortex::wc::WstringToUtf8(
fmu::ToAbsoluteCortexDataPath(fs::path(mc.files[0])).wstring());
+ model_load_params["model_path"] =
+ cortex::wc::WstringToUtf8(
+ fmu::ToAbsoluteCortexDataPath(fs::path(mc.files[0])).wstring());
#else
json_data["model_path"] =
fmu::ToAbsoluteCortexDataPath(fs::path(mc.files[0])).string();
+ model_load_params["model_path"] =
+ fmu::ToAbsoluteCortexDataPath(fs::path(mc.files[0])).string();
#endif
} else {
LOG_WARN << "model_path is empty";
@@ -642,6 +646,8 @@ cpp::result<StartModelResult, std::string> ModelService::StartModel(
#else
json_data["mmproj"] =
fmu::ToAbsoluteCortexDataPath(fs::path(mc.mmproj)).string();
+ model_load_params["mmproj"] =
+ fmu::ToAbsoluteCortexDataPath(fs::path(mc.mmproj)).string();
#endif
}
json_data["system_prompt"] = mc.system_template;
@@ -655,6 +661,7 @@ cpp::result<StartModelResult, std::string> ModelService::StartModel(
}
json_data["model"] = model_handle;
+ model_load_params["model"] = model_handle;
if (auto& cpt = custom_prompt_template; !cpt.value_or("").empty()) {
auto parse_prompt_result = string_utils::ParsePrompt(cpt.value());
json_data["system_prompt"] = parse_prompt_result.system_prompt;
@@ -662,8 +669,6 @@ cpp::result<StartModelResult, std::string> ModelService::StartModel(
json_data["ai_prompt"] = parse_prompt_result.ai_prompt;
}
- json_helper::MergeJson(json_data, params_override);
-
// Set default cpu_threads if it is not configured
if (!json_data.isMember("cpu_threads")) {
json_data["cpu_threads"] = GetCpuThreads();
@@ -686,12 +691,12 @@ cpp::result<StartModelResult, std::string> ModelService::StartModel(
assert(!!inference_svc_);
- auto ir =
- inference_svc_->LoadModel(std::make_shared<Json::Value>(json_data));
+ auto ir = inference_svc_->LoadModel(
+ std::make_shared<Json::Value>(model_load_params));
auto status = std::get<0>(ir)["status_code"].asInt();
auto data = std::get<1>(ir);
- if (status == drogon::k200OK) {
+ if (status == drogon::k200OK) {
return StartModelResult{/* .success = */ true,
/* .warning = */ may_fallback_res.value()};
} else if (status == drogon::k409Conflict) {
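Because LoadModel now receives model_load_params rather than the merged json_data, the engine sees only the caller's overrides plus the fields copied in explicitly above (model_path, mmproj, model); prompt-template fields such as system_prompt stay local to json_data. An illustrative payload under those assumptions:

    #include <json/json.h>
    #include <memory>

    // Example shape of the payload handed to LoadModel; keys and values
    // are illustrative, not taken from a real request.
    std::shared_ptr<Json::Value> MakeExamplePayload() {
      auto payload = std::make_shared<Json::Value>();
      (*payload)["ctx_len"] = 4096;                       // from params_override
      (*payload)["model_path"] = "/abs/path/model.gguf";  // copied in above
      (*payload)["model"] = "my-model";                   // copied in above
      return payload;
    }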
@@ -1031,13 +1036,15 @@ ModelService::MayFallbackToCpu(const std::string& model_path, int ngl,
auto es = hardware::EstimateLLaMACppRun(model_path, rc);
if (!!es && (*es).gpu_mode.vram_MiB > free_vram_MiB && is_cuda) {
- CTL_WRN("Not enough VRAM - " << "required: " << (*es).gpu_mode.vram_MiB
- << ", available: " << free_vram_MiB);
+ CTL_WRN("Not enough VRAM - "
+ << "required: " << (*es).gpu_mode.vram_MiB
+ << ", available: " << free_vram_MiB);
}
if (!!es && (*es).cpu_mode.ram_MiB > free_ram_MiB) {
- CTL_WRN("Not enough RAM - " << "required: " << (*es).cpu_mode.ram_MiB
- << ", available: " << free_ram_MiB);
+ CTL_WRN("Not enough RAM - "
+ << "required: " << (*es).cpu_mode.ram_MiB
+ << ", available: " << free_ram_MiB);
}
return warning;