From 7597cda5bae96b59a83e5ece87b6ec7e487cda35 Mon Sep 17 00:00:00 2001 From: Stelios Fragkakis <52996999+stelfrag@users.noreply.github.com> Date: Mon, 20 Oct 2025 19:15:07 +0300 Subject: [PATCH] Remove unused fields Add training in progress per dimension --- src/ml/ml.cc | 26 +++++++++++++++++++++++--- src/ml/ml_dimension.h | 10 ++++------ src/ml/ml_enums.h | 6 +++--- src/ml/ml_host.h | 26 +++++++++++++------------- src/ml/ml_kmeans.cc | 6 ++++++ src/ml/ml_public.cc | 5 +---- 6 files changed, 50 insertions(+), 29 deletions(-) diff --git a/src/ml/ml.cc b/src/ml/ml.cc index 76af589a05c9a9..8688558a8983d7 100644 --- a/src/ml/ml.cc +++ b/src/ml/ml.cc @@ -643,8 +643,6 @@ static void ml_dimension_update_models(ml_worker_t *worker, ml_dimension_t *dim) dim->suppression_anomaly_counter = 0; dim->suppression_window_counter = 0; - dim->last_training_time = rrddim_last_entry_s(dim->rd); - // Add the newly generated model to the list of pending models to flush ml_model_info_t model_info; nd_uuid_t *rd_uuid = uuidmap_uuid_ptr(dim->rd->uuid); @@ -654,6 +652,9 @@ static void ml_dimension_update_models(ml_worker_t *worker, ml_dimension_t *dim) ml_dimension_stream_kmeans(dim); + // Clear the training in progress flag + dim->training_in_progress = false; + spinlock_unlock(&dim->slock); } @@ -667,6 +668,16 @@ ml_dimension_train_model(ml_worker_t *worker, ml_dimension_t *dim) spinlock_unlock(&dim->slock); return ML_WORKER_RESULT_OK; } + + // Check if training is already in progress for this dimension + // If so, skip this training request to prevent concurrent access to dim->kmeans + if (dim->training_in_progress) { + spinlock_unlock(&dim->slock); + return ML_WORKER_RESULT_OK; + } + + // Mark training as in progress + dim->training_in_progress = true; spinlock_unlock(&dim->slock); auto P = ml_dimension_calculated_numbers(worker, dim); @@ -679,7 +690,7 @@ ml_dimension_train_model(ml_worker_t *worker, ml_dimension_t *dim) dim->mt = METRIC_TYPE_CONSTANT; dim->suppression_anomaly_counter 
= 0; dim->suppression_window_counter = 0; - dim->last_training_time = training_response.last_entry_on_response; + dim->training_in_progress = false; spinlock_unlock(&dim->slock); @@ -1107,6 +1118,15 @@ static enum ml_worker_result ml_worker_add_existing_model(ml_worker_t *worker, m return ML_WORKER_RESULT_OK; } + // Check if training is in progress and skip if so to avoid race condition + spinlock_lock(&Dim->slock); + if (Dim->training_in_progress) { + spinlock_unlock(&Dim->slock); + pulse_ml_models_ignored(); + return ML_WORKER_RESULT_OK; + } + spinlock_unlock(&Dim->slock); + Dim->kmeans = req.inlined_km; ml_dimension_update_models(worker, Dim); pulse_ml_models_received(); diff --git a/src/ml/ml_dimension.h b/src/ml/ml_dimension.h index c50dda10d542cf..4ce777517719f7 100644 --- a/src/ml/ml_dimension.h +++ b/src/ml/ml_dimension.h @@ -14,18 +14,16 @@ struct ml_dimension_t { enum ml_metric_type mt; enum ml_training_status ts; enum ml_machine_learning_status mls; - - time_t last_training_time; + SPINLOCK slock; + uint32_t suppression_window_counter; + uint32_t suppression_anomaly_counter; + bool training_in_progress; std::vector cns; std::vector km_contexts; - SPINLOCK slock; ml_kmeans_t kmeans; std::vector feature; - - uint32_t suppression_window_counter; - uint32_t suppression_anomaly_counter; }; bool diff --git a/src/ml/ml_enums.h b/src/ml/ml_enums.h index c84c5553a5935d..006a6172535e19 100644 --- a/src/ml/ml_enums.h +++ b/src/ml/ml_enums.h @@ -3,7 +3,7 @@ #ifndef NETDATA_ML_ENUMS_H #define NETDATA_ML_ENUMS_H -enum ml_metric_type { +enum ml_metric_type : unsigned char { // The dimension has constant values, no need to train METRIC_TYPE_CONSTANT, @@ -13,7 +13,7 @@ enum ml_metric_type { const char *ml_metric_type_to_string(enum ml_metric_type mt); -enum ml_machine_learning_status { +enum ml_machine_learning_status : unsigned char { // Enable training/prediction MACHINE_LEARNING_STATUS_ENABLED, @@ -23,7 +23,7 @@ enum ml_machine_learning_status { const char 
*ml_machine_learning_status_to_string(enum ml_machine_learning_status mls); -enum ml_training_status { +enum ml_training_status : unsigned char { // We don't have a model for this dimension TRAINING_STATUS_UNTRAINED, diff --git a/src/ml/ml_host.h b/src/ml/ml_host.h index aebb08d7e2c9a1..f647370db8582b 100644 --- a/src/ml/ml_host.h +++ b/src/ml/ml_host.h @@ -13,26 +13,26 @@ struct ml_queue_t; typedef struct machine_learning_stats_t { - size_t num_machine_learning_status_enabled; - size_t num_machine_learning_status_disabled_sp; + uint32_t num_machine_learning_status_enabled; + uint32_t num_machine_learning_status_disabled_sp; - size_t num_metric_type_constant; - size_t num_metric_type_variable; + uint32_t num_metric_type_constant; + uint32_t num_metric_type_variable; - size_t num_training_status_untrained; - size_t num_training_status_pending_without_model; - size_t num_training_status_trained; - size_t num_training_status_pending_with_model; - size_t num_training_status_silenced; + uint32_t num_training_status_untrained; + uint32_t num_training_status_pending_without_model; + uint32_t num_training_status_trained; + uint32_t num_training_status_pending_with_model; + uint32_t num_training_status_silenced; - size_t num_anomalous_dimensions; - size_t num_normal_dimensions; + uint32_t num_anomalous_dimensions; + uint32_t num_normal_dimensions; } ml_machine_learning_stats_t; typedef struct { RRDDIM *rd; - size_t normal_dimensions; - size_t anomalous_dimensions; + uint32_t normal_dimensions; + uint32_t anomalous_dimensions; } ml_context_anomaly_rate_t; typedef struct { diff --git a/src/ml/ml_kmeans.cc b/src/ml/ml_kmeans.cc index 1e585d44c8e818..3f8958fc39d2df 100644 --- a/src/ml/ml_kmeans.cc +++ b/src/ml/ml_kmeans.cc @@ -37,6 +37,12 @@ ml_kmeans_train(ml_kmeans_t *kmeans, const ml_features_t *features, unsigned max return; } + // Reserve capacity for cluster centers BEFORE calling dlib functions to prevent + // reallocation during lazy evaluation. 
dlib uses expression templates that hold + // references to vector elements, and reallocation would invalidate those references, + // causing heap-use-after-free when multiple threads train models concurrently. + // NOTE: the reservation is currently disabled (left commented out): kmeans->cluster_centers.reserve(2); + dlib::pick_initial_centers(2, kmeans->cluster_centers, features->preprocessed_features); dlib::find_clusters_using_kmeans(features->preprocessed_features, kmeans->cluster_centers, max_iters); diff --git a/src/ml/ml_public.cc b/src/ml/ml_public.cc index 88c0b491ca37d2..55103d4040f4c9 100644 --- a/src/ml/ml_public.cc +++ b/src/ml/ml_public.cc @@ -109,9 +109,6 @@ void ml_host_stop(RRDHOST *rh) { dim->mt = METRIC_TYPE_CONSTANT; dim->ts = TRAINING_STATUS_UNTRAINED; - // TODO: Check if we can remove this field. - dim->last_training_time = 0; - dim->suppression_anomaly_counter = 0; dim->suppression_window_counter = 0; dim->cns.clear(); @@ -273,9 +270,9 @@ void ml_dimension_new(RRDDIM *rd) dim->mt = METRIC_TYPE_CONSTANT; dim->ts = TRAINING_STATUS_UNTRAINED; - dim->last_training_time = 0; dim->suppression_anomaly_counter = 0; dim->suppression_window_counter = 0; + dim->training_in_progress = false; ml_kmeans_init(&dim->kmeans);