11#include " server-common.h"
22#include " server-models.h"
3+ #include " server-context.h"
34
45#include " build-info.h"
56#include " preset.h"
@@ -44,9 +45,7 @@ extern char **environ;
4445#define DEFAULT_STOP_TIMEOUT 10 // seconds
4546
4647#define CMD_ROUTER_TO_CHILD_EXIT " cmd_router_to_child:exit"
47- #define CMD_CHILD_TO_ROUTER_READY " cmd_child_to_router:ready" // also sent when waking up from sleep
48- #define CMD_CHILD_TO_ROUTER_SLEEP " cmd_child_to_router:sleep"
49- #define CMD_CHILD_TO_ROUTER_INFO " cmd_child_to_router:info:" // followed by json string
// The three removed child->router markers above are replaced by the single prefix below.
// The child prints this prefix to stdout followed by a JSON object carrying "state" and
// "payload" keys (see server_child::notify_to_router); the router matches the prefix on
// each stdout line it reads from the child and hands the line to handle_child_state.
48+ #define CMD_CHILD_TO_ROUTER_STATE " cmd_child_to_router:state:" // followed by json string
5049
5150// address for child process, this is needed because router may run on 0.0.0.0
5251// ref: https://github.com/ggml-org/llama.cpp/issues/17862
@@ -904,12 +903,8 @@ void server_models::load(const std::string & name) {
904903 while (fgets (buffer, vec_buf.size (), stdout_file) != nullptr ) {
905904 LOG (" [%5d] %s" , port, buffer);
906905 std::string str (buffer);
907- if (string_starts_with (buffer, CMD_CHILD_TO_ROUTER_READY )) {
908- this ->update_status (name, SERVER_MODEL_STATUS_LOADED , 0 );
909- } else if (string_starts_with (buffer, CMD_CHILD_TO_ROUTER_INFO )) {
910- this ->update_loaded_info (name, str);
911- } else if (string_starts_with (buffer, CMD_CHILD_TO_ROUTER_SLEEP )) {
912- this ->update_status (name, SERVER_MODEL_STATUS_SLEEPING , 0 );
906+ if (string_starts_with (buffer, CMD_CHILD_TO_ROUTER_STATE )) {
907+ this ->handle_child_state (name, str);
913908 }
914909 }
915910 } else {
@@ -976,7 +971,10 @@ void server_models::load(const std::string & name) {
976971 subprocess_destroy (&child_proc->get ());
977972
978973 // update status and exit code
979- this ->update_status (name, SERVER_MODEL_STATUS_UNLOADED , exit_code);
974+ this ->update_status (name, {
975+ SERVER_MODEL_STATUS_UNLOADED ,
976+ exit_code
977+ });
980978 SRV_INF (" instance name=%s exited with status %d\n " , name.c_str (), exit_code);
981979 });
982980
@@ -1016,7 +1014,8 @@ struct server_models_download_res : public common_download_callback {
10161014 common_download_model (model, opts);
10171015 is_ok = true ;
10181016 } catch (const std::exception & e) {
1019- SRV_ERR (" download failed for model name=%s: %s\n " , model.name .c_str (), e.what ());
1017+ auto model_name = model.get_name ();
1018+ SRV_ERR (" download failed for model name=%s: %s\n " , model_name.c_str (), e.what ());
10201019 is_ok = false ;
10211020 }
10221021 return is_ok;
@@ -1036,7 +1035,7 @@ struct server_models_download_res : public common_download_callback {
10361035};
10371036
10381037void server_models::download (common_params_model && model, common_download_opts && opts) {
1039- std::string name = model.name ;
1038+ std::string name = model.get_name () ;
10401039 GGML_ASSERT (name == model.hf_repo );
10411040
10421041 std::unique_lock<std::mutex> lk (mutex);
@@ -1064,9 +1063,10 @@ void server_models::download(common_params_model && model, common_download_opts
10641063 inst.th = std::thread ([this , dl = std::move (dl)]() {
10651064 dl->opts .callback = dl.get ();
10661065 bool ok = dl->run ();
1066+ auto model_name = dl->model .get_name ();
10671067 SRV_INF (" download finished for model name=%s with status=%s\n " ,
1068- dl-> model . name .c_str (), ok ? " success" : " failure" );
1069- update_download_progress (dl-> model . name , {}, true , ok);
1068+ model_name .c_str (), ok ? " success" : " failure" );
1069+ update_download_progress (model_name , {}, true , ok);
10701070 // need_reload is set inside update_download_progress under the mutex;
10711071 // the next load_models() call will clean up this instance
10721072 });
@@ -1130,51 +1130,34 @@ void server_models::unload_all() {
11301130 }
11311131}
11321132
// Records a new status for the model instance `name` and announces it.
//
// After this change the status, exit code and optional loaded info travel together in
// `args` (update_status_args) instead of separate status/exit_code parameters.
//
// Steps, all performed while `lk` holds `mutex` (the lock is function-scoped):
//   1. If `name` is present in `mapping`, overwrite meta.status and meta.exit_code.
//      meta.loaded_info is overwritten only when args.loaded_info is non-null, so a
//      null loaded_info leaves any previously stored info untouched.
//   2. Send a status_change SSE event. This happens even when `name` was not found in
//      `mapping`. The event always carries the status string; exit_code is attached only
//      for SERVER_MODEL_STATUS_UNLOADED, and info only when args.loaded_info is non-null.
//   3. Wake every waiter on `cv`.
1133- void server_models::update_status (const std::string & name, server_model_status status, int exit_code ) {
1133+ void server_models::update_status (const std::string & name, const update_status_args & args ) {
11341134 std::unique_lock<std::mutex> lk (mutex);
11351135 auto it = mapping.find (name);
11361136 if (it != mapping.end ()) {
11371137 auto & meta = it->second .meta ;
1138- meta.status = status;
1139- meta.exit_code = exit_code;
1138+ meta.status = args.status ;
1139+ meta.exit_code = args.exit_code ;
// keep the previously stored loaded_info when the caller did not supply one
1140+ if (!args.loaded_info .is_null ()) {
1141+ meta.loaded_info = args.loaded_info ;
1142+ }
11401143 }
11411144 // broadcast status change to SSE
11421145 {
11431146 json data = {
1144- {" status" , server_model_status_to_string (status)},
1147+ {" status" , server_model_status_to_string (args. status )},
11451148 };
1146- if (status == SERVER_MODEL_STATUS_UNLOADED ) {
1147- data[" exit_code" ] = exit_code;
1149+ if (args.status == SERVER_MODEL_STATUS_UNLOADED ) {
1150+ data[" exit_code" ] = args.exit_code ;
1151+ }
// the info field is new in this change: the removed update_loaded_info never broadcast it
1152+ if (!args.loaded_info .is_null ()) {
1153+ data[" info" ] = args.loaded_info ;
11481154 }
11491155 // note: notify_sse doesn't acquire the lock, so no deadlock here
11501156 notify_sse (" status_change" , name, data);
11511157 }
11521158 cv.notify_all ();
11531159}
11541160
// Removed by this change (every line below is a deletion).
// It used to check the CMD_CHILD_TO_ROUTER_INFO prefix, parse the JSON that follows it,
// store the result in meta.loaded_info under `mutex`, and notify `cv`; it did not send
// any SSE event. JSON parsing of child messages now lives in handle_child_state, and
// meta.loaded_info is written by update_status.
1155- void server_models::update_loaded_info (const std::string & name, std::string & raw_info) {
1156- if (!string_starts_with (raw_info, CMD_CHILD_TO_ROUTER_INFO )) {
1157- SRV_WRN (" invalid loaded info format from child for model name=%s: %s\n " , name.c_str (), raw_info.c_str ());
1158- return ;
1159- }
1160-
1161- json info;
1162- try {
1163- info = json::parse (raw_info.substr (strlen (CMD_CHILD_TO_ROUTER_INFO )));
1164- } catch (const std::exception & e) {
1165- SRV_WRN (" failed to parse loaded info from child for model name=%s: %s\n " , name.c_str (), e.what ());
1166- return ;
1167- }
1168-
1169- std::unique_lock<std::mutex> lk (mutex);
1170- auto it = mapping.find (name);
1171- if (it != mapping.end ()) {
1172- auto & meta = it->second .meta ;
1173- meta.loaded_info = info;
1174- }
1175- cv.notify_all ();
1176- }
11781161void server_models::update_download_progress (const std::string & name, const common_download_progress & progress, bool done, bool ok) {
11791162 json curr;
11801163 {
@@ -1323,21 +1306,54 @@ server_http_res_ptr server_models::proxy_request(const server_http_req & req, co
13231306 return proxy;
13241307}
13251308
1326- bool server_models::is_child_server () {
// Handles one state message received from the child process serving model `name`.
//
// `raw_input` is the full stdout line, including the CMD_CHILD_TO_ROUTER_STATE prefix;
// the visible caller in server_models::load only passes lines that start with it.
// The text after the prefix is parsed as a JSON object from which "state" (a string
// converted via server_state_from_str) and "payload" are read.
//
// Any std::exception raised while decoding is logged with SRV_ERR and the message is
// dropped without touching the model status.
1309+ void server_models::handle_child_state (const std::string & name, const std::string & raw_input) {
1310+ server_state state;
1311+ json payload;
1312+
1313+ try {
// substr() throws std::out_of_range if raw_input is shorter than the prefix; that is
// a std::exception and therefore ends up in the catch below as well
1314+ json data = json::parse (raw_input.substr (strlen (CMD_CHILD_TO_ROUTER_STATE )));
// NOTE(review): json_value presumably falls back to the given default when the key is
// missing - confirm against its definition
1315+ state = server_state_from_str (json_value (data, " state" , std::string ()));
1316+ payload = json_value (data, " payload" , json{});
1317+ } catch (const std::exception & e) {
1318+ SRV_ERR (" failed to parse child state update for name=%s: %s\n " , name.c_str (), e.what ());
1319+ return ;
1320+ }
1321+
1322+ switch (state) {
1323+ case SERVER_STATE_LOADING :
1324+ {
1325+ // do nothing for now
1326+ // TODO: report loading progress for first load and wakeup from sleep
1327+ } break ;
1328+ case SERVER_STATE_READY :
1329+ {
// mark the model as loaded with exit code 0; a payload whose size() is 0 is forwarded
// as a null json, which update_status treats as "keep the stored loaded_info"
1330+ update_status (name, {
1331+ SERVER_MODEL_STATUS_LOADED ,
1332+ 0 ,
1333+ // note: payload can be empty if this is a wakeup from sleep
1334+ payload.size () > 0 ? payload : nullptr
1335+ });
1336+ } break ;
1337+ case SERVER_STATE_SLEEPING :
1338+ {
// only the status is given here; the remaining update_status_args fields take whatever
// defaults that struct declares (its definition is outside this view)
1339+ update_status (name, { SERVER_MODEL_STATUS_SLEEPING });
1340+ } break ;
1341+ default :
1342+ // should never happen, but just in case
1343+ GGML_ASSERT (false && " unexpected state from child server" );
1344+ }
1345+ }
1346+
1347+ //
1348+ // server_child
1349+ //
1350+
// Reports whether the LLAMA_SERVER_ROUTER_PORT environment variable is set.
// Only presence is tested: std::getenv returns non-null for a variable that is set to
// an empty string, and the value itself is not parsed here.
// (Renamed in this change from server_models::is_child_server.)
1351+ bool server_child::is_child () {
13271352 const char * router_port = std::getenv (" LLAMA_SERVER_ROUTER_PORT" );
13281353 return router_port != nullptr ;
13291354}
13301355
1331- std::thread server_models::setup_child_server (const std::function<void (int )> & shutdown_handler, const json & model_info) {
1332- // send a notification to the router server that a model instance is ready
1333- common_log_pause (common_log_main ());
1334- fflush (stdout);
1335- fprintf (stdout, " %s\n " , CMD_CHILD_TO_ROUTER_READY );
1336- fflush (stdout);
1337- fprintf (stdout, " %s%s\n " , CMD_CHILD_TO_ROUTER_INFO , safe_json_to_str (model_info).c_str ());
1338- fflush (stdout);
1339- common_log_resume (common_log_main ());
1340-
1356+ std::thread server_child::setup (const std::function<void (int )> & shutdown_handler) {
13411357 // setup thread for monitoring stdin
13421358 return std::thread ([shutdown_handler]() {
13431359 // wait for EOF on stdin
@@ -1363,10 +1379,14 @@ std::thread server_models::setup_child_server(const std::function<void(int)> & s
13631379 });
13641380}
13651381
// Sends a state message to the router by writing a single line to stdout:
// CMD_CHILD_TO_ROUTER_STATE immediately followed by the serialized JSON object
// {state, payload}, then a newline.
//
// Replaces notify_router_sleeping_state(bool), which could only emit the fixed
// sleep/ready markers; the state is now an arbitrary string plus a JSON payload.
//
// stdout is flushed before and after the write, and the main logger is paused for the
// duration - presumably so log output cannot interleave with the message line
// (NOTE(review): confirm against common_log_pause semantics).
1366- void server_models::notify_router_sleeping_state (bool is_sleeping) {
1382+ void server_child::notify_to_router (const std::string & state, const json & payload) {
1383+ json data = {
1384+ {" state" , state},
1385+ {" payload" , payload},
1386+ };
13671387 common_log_pause (common_log_main ());
13681388 fflush (stdout);
1369- fprintf (stdout, " %s\n " , is_sleeping ? CMD_CHILD_TO_ROUTER_SLEEP : CMD_CHILD_TO_ROUTER_READY );
1389+ fprintf (stdout, " %s%s \n " , CMD_CHILD_TO_ROUTER_STATE , safe_json_to_str (data). c_str () );
13701390 fflush (stdout);
13711391 common_log_resume (common_log_main ());
13721392}
@@ -1644,7 +1664,6 @@ void server_models_routes::init_routes() {
16441664 common_params_model model;
16451665 common_download_opts opts;
16461666
1647- model.name = name;
16481667 model.hf_repo = name;
16491668 opts.bearer_token = params.hf_token ;
16501669 opts.download_mmproj = true ;
0 commit comments