Thanks for visiting codestin.com
Credit goes to github.com

Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 23 additions & 3 deletions src/ml/ml.cc
Original file line number Diff line number Diff line change
Expand Up @@ -643,8 +643,6 @@ static void ml_dimension_update_models(ml_worker_t *worker, ml_dimension_t *dim)
dim->suppression_anomaly_counter = 0;
dim->suppression_window_counter = 0;

dim->last_training_time = rrddim_last_entry_s(dim->rd);

// Add the newly generated model to the list of pending models to flush
ml_model_info_t model_info;
nd_uuid_t *rd_uuid = uuidmap_uuid_ptr(dim->rd->uuid);
Expand All @@ -654,6 +652,9 @@ static void ml_dimension_update_models(ml_worker_t *worker, ml_dimension_t *dim)

ml_dimension_stream_kmeans(dim);

// Clear the training in progress flag
dim->training_in_progress = false;

spinlock_unlock(&dim->slock);
}

Expand All @@ -667,6 +668,16 @@ ml_dimension_train_model(ml_worker_t *worker, ml_dimension_t *dim)
spinlock_unlock(&dim->slock);
return ML_WORKER_RESULT_OK;
}

// Check if training is already in progress for this dimension
// If so, skip this training request to prevent concurrent access to dim->kmeans
if (dim->training_in_progress) {
spinlock_unlock(&dim->slock);
return ML_WORKER_RESULT_OK;
}

// Mark training as in progress
dim->training_in_progress = true;
spinlock_unlock(&dim->slock);

auto P = ml_dimension_calculated_numbers(worker, dim);
Expand All @@ -679,7 +690,7 @@ ml_dimension_train_model(ml_worker_t *worker, ml_dimension_t *dim)
dim->mt = METRIC_TYPE_CONSTANT;
dim->suppression_anomaly_counter = 0;
dim->suppression_window_counter = 0;
dim->last_training_time = training_response.last_entry_on_response;
dim->training_in_progress = false;

spinlock_unlock(&dim->slock);

Expand Down Expand Up @@ -1107,6 +1118,15 @@ static enum ml_worker_result ml_worker_add_existing_model(ml_worker_t *worker, m
return ML_WORKER_RESULT_OK;
}

// Check if training is in progress and skip if so to avoid race condition
spinlock_lock(&Dim->slock);
if (Dim->training_in_progress) {
spinlock_unlock(&Dim->slock);
pulse_ml_models_ignored();
return ML_WORKER_RESULT_OK;
}
spinlock_unlock(&Dim->slock);

Dim->kmeans = req.inlined_km;
ml_dimension_update_models(worker, Dim);
pulse_ml_models_received();
Expand Down
10 changes: 4 additions & 6 deletions src/ml/ml_dimension.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,18 +14,16 @@ struct ml_dimension_t {
enum ml_metric_type mt;
enum ml_training_status ts;
enum ml_machine_learning_status mls;

time_t last_training_time;
SPINLOCK slock;
uint32_t suppression_window_counter;
uint32_t suppression_anomaly_counter;
bool training_in_progress;

std::vector<calculated_number_t> cns;

std::vector<ml_kmeans_inlined_t> km_contexts;
SPINLOCK slock;
ml_kmeans_t kmeans;
std::vector<DSample> feature;

uint32_t suppression_window_counter;
uint32_t suppression_anomaly_counter;
};

bool
Expand Down
6 changes: 3 additions & 3 deletions src/ml/ml_enums.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
#ifndef NETDATA_ML_ENUMS_H
#define NETDATA_ML_ENUMS_H

enum ml_metric_type {
enum ml_metric_type : unsigned char {
// The dimension has constant values, no need to train
METRIC_TYPE_CONSTANT,

Expand All @@ -13,7 +13,7 @@ enum ml_metric_type {

const char *ml_metric_type_to_string(enum ml_metric_type mt);

enum ml_machine_learning_status {
enum ml_machine_learning_status : unsigned char {
// Enable training/prediction
MACHINE_LEARNING_STATUS_ENABLED,

Expand All @@ -23,7 +23,7 @@ enum ml_machine_learning_status {

const char *ml_machine_learning_status_to_string(enum ml_machine_learning_status mls);

enum ml_training_status {
enum ml_training_status : unsigned char {
// We don't have a model for this dimension
TRAINING_STATUS_UNTRAINED,

Expand Down
26 changes: 13 additions & 13 deletions src/ml/ml_host.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,26 +13,26 @@
struct ml_queue_t;

typedef struct machine_learning_stats_t {
size_t num_machine_learning_status_enabled;
size_t num_machine_learning_status_disabled_sp;
uint32_t num_machine_learning_status_enabled;
uint32_t num_machine_learning_status_disabled_sp;

size_t num_metric_type_constant;
size_t num_metric_type_variable;
uint32_t num_metric_type_constant;
uint32_t num_metric_type_variable;

size_t num_training_status_untrained;
size_t num_training_status_pending_without_model;
size_t num_training_status_trained;
size_t num_training_status_pending_with_model;
size_t num_training_status_silenced;
uint32_t num_training_status_untrained;
uint32_t num_training_status_pending_without_model;
uint32_t num_training_status_trained;
uint32_t num_training_status_pending_with_model;
uint32_t num_training_status_silenced;

size_t num_anomalous_dimensions;
size_t num_normal_dimensions;
uint32_t num_anomalous_dimensions;
uint32_t num_normal_dimensions;
} ml_machine_learning_stats_t;

typedef struct {
RRDDIM *rd;
size_t normal_dimensions;
size_t anomalous_dimensions;
uint32_t normal_dimensions;
uint32_t anomalous_dimensions;
} ml_context_anomaly_rate_t;

typedef struct {
Expand Down
6 changes: 6 additions & 0 deletions src/ml/ml_kmeans.cc
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,12 @@ ml_kmeans_train(ml_kmeans_t *kmeans, const ml_features_t *features, unsigned max
return;
}

// Reserving capacity for cluster centers BEFORE calling dlib functions would prevent
// reallocation during lazy evaluation. dlib uses expression templates that hold
// references to vector elements, and reallocation would invalidate those references,
// causing heap-use-after-free when multiple threads train models concurrently.
// NOTE: the reserve() call below is currently commented out, so no capacity is reserved here.
//kmeans->cluster_centers.reserve(2);

dlib::pick_initial_centers(2, kmeans->cluster_centers, features->preprocessed_features);
dlib::find_clusters_using_kmeans(features->preprocessed_features, kmeans->cluster_centers, max_iters);

Expand Down
5 changes: 1 addition & 4 deletions src/ml/ml_public.cc
Original file line number Diff line number Diff line change
Expand Up @@ -109,9 +109,6 @@ void ml_host_stop(RRDHOST *rh) {
dim->mt = METRIC_TYPE_CONSTANT;
dim->ts = TRAINING_STATUS_UNTRAINED;

// TODO: Check if we can remove this field.
dim->last_training_time = 0;

dim->suppression_anomaly_counter = 0;
dim->suppression_window_counter = 0;
dim->cns.clear();
Expand Down Expand Up @@ -273,9 +270,9 @@ void ml_dimension_new(RRDDIM *rd)

dim->mt = METRIC_TYPE_CONSTANT;
dim->ts = TRAINING_STATUS_UNTRAINED;
dim->last_training_time = 0;
dim->suppression_anomaly_counter = 0;
dim->suppression_window_counter = 0;
dim->training_in_progress = false;

ml_kmeans_init(&dim->kmeans);

Expand Down
Loading