Thanks to visit codestin.com
Credit goes to github.com

Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
15fe52c
Refactor pluginsd dimension array handling to use reference-counted s…
stelfrag Jan 24, 2026
f5cb521
Fix compilation (add missing file)
stelfrag Jan 30, 2026
9e345ae
Add PRD_ARRAY refcount stress test and improve thread safety with spi…
stelfrag Jan 30, 2026
414253b
Improve thread safety and optimize cleanup logic for PRD_ARRAY. Use `…
stelfrag Jan 30, 2026
458ee12
Ensure the collector is fully stopped before cleanup.
stelfrag Jan 30, 2026
25ae6b5
Code cleanup
stelfrag Jan 30, 2026
d5590c3
Cleanup: use sleep_usec()
stelfrag Jan 30, 2026
a1f00ab
Ensure proper cleanup of `pluginsd_chart_slots` when collector is active
stelfrag Feb 11, 2026
127cf42
Improve thread safety: avoid clearing `collector_tid` when `old_st ==…
stelfrag Feb 20, 2026
52db754
Improve thread safety and cleanup logic for PRD_ARRAY and chart slots:
stelfrag Feb 21, 2026
083e645
database: extract chart slot mapping cleanup helper
stelfrag Feb 21, 2026
ea6c91c
pluginsd: harden collector_tid and PRD array cleanup handoff
stelfrag Feb 21, 2026
d0462d5
- Add null checks for `rda` to prevent dereferencing invalid pointers.
stelfrag Mar 2, 2026
d14fd07
Add null checks for `prd->id` and `arr->entries[t].id` in pluginsd in…
stelfrag Mar 2, 2026
7cf2f49
Address review comments
stelfrag Mar 2, 2026
3eb727c
Address review comments part 2
stelfrag Mar 2, 2026
903a301
- Replace `memcpy` with explicit per-field copying to avoid stale `rd…
stelfrag Mar 2, 2026
1e6eb16
Add spinlock protection during slot cache growth to ensure concurrenc…
stelfrag Mar 2, 2026
3fca1dc
Improve memory tracking and spinlock-protected lifecycle handling for…
stelfrag Mar 2, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions src/daemon/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include "web/mcp/mcp.h"

#include "database/engine/page_test.h"
#include "database/rrdset-slots.h"
#include <curl/curl.h>

#ifdef OS_WINDOWS
Expand Down Expand Up @@ -154,6 +155,7 @@ int help(int exitcode) {
" size of E MiB, an optional disk space limit\n"
" of F MiB, G libuv workers (default 16) and exit.\n\n"
#endif
" -W prd-array-stress Run PRD_ARRAY refcount stress test and exit.\n\n"
" -W set section option value\n"
" set netdata.conf option from the command line.\n\n"
" -W buildinfo Print the version, the configure options,\n"
Expand Down Expand Up @@ -453,6 +455,10 @@ int netdata_main(int argc, char **argv) {
unittest_running = true;
return rwlocks_stress_test();
}
else if(strcmp(optarg, "prd-array-stress") == 0) {
unittest_running = true;
return prd_array_stress_test();
}
else if(strcmp(optarg, "stringtest") == 0) {
unittest_running = true;
return string_unittest(10000);
Expand Down
22 changes: 19 additions & 3 deletions src/database/rrdhost-slots.c
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,29 @@ void rrdhost_pluginsd_send_chart_slots_free(RRDHOST *host) {
}

void rrdhost_pluginsd_receive_chart_slots_free(RRDHOST *host) {
rrd_slot_memory_removed(host->stream.rcv.pluginsd_chart_slots.size * sizeof(uint32_t));
rrd_slot_memory_removed(host->stream.rcv.pluginsd_chart_slots.size * sizeof(RRDSET *));

spinlock_lock(&host->stream.rcv.pluginsd_chart_slots.spinlock);

if(host->stream.rcv.pluginsd_chart_slots.array) {
for (size_t s = 0; s < host->stream.rcv.pluginsd_chart_slots.size; s++)
rrdset_pluginsd_receive_unslot_and_cleanup(host->stream.rcv.pluginsd_chart_slots.array[s]);
for (size_t s = 0; s < host->stream.rcv.pluginsd_chart_slots.size; s++) {
RRDSET *st = host->stream.rcv.pluginsd_chart_slots.array[s];
if(st) {
// Clear collector_tid - the collector is already stopped
// (stream_receiver_signal_to_stop_and_wait was called before this)
// so it's safe to cleanup regardless of the previous collector_tid value
__atomic_store_n(&st->pluginsd.collector_tid, 0, __ATOMIC_RELEASE);

// Pre-clear last_slot so that rrdset_pluginsd_receive_unslot_and_cleanup
// won't try to re-acquire the host spinlock we already hold.
// This prevents recursive locking on pluginsd_chart_slots.spinlock.
// We're freeing the entire host slots array below, so clearing individual
// slot entries is unnecessary.
st->pluginsd.last_slot = -1;

rrdset_pluginsd_receive_unslot_and_cleanup(st);
}
}

freez(host->stream.rcv.pluginsd_chart_slots.array);
host->stream.rcv.pluginsd_chart_slots.array = NULL;
Expand Down
139 changes: 139 additions & 0 deletions src/database/rrdset-pluginsd-array.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
// SPDX-License-Identifier: GPL-3.0-or-later

#ifndef NETDATA_RRDSET_PLUGINSD_ARRAY_H
#define NETDATA_RRDSET_PLUGINSD_ARRAY_H

// This header must be included AFTER rrddim.h to get the full struct pluginsd_rrddim definition

#include "rrddim.h"

// --------------------------------------------------------------------------------------------------------------------
// Reference-counted array for pluginsd dimension caching
//
// This structure provides thread-safe access to the dimension cache array used by the pluginsd protocol.
// The reference counting ensures that the array is not freed while any thread is still using it.
//
// THREAD SAFETY - LIFECYCLE SEPARATION:
// -------------------------------------
// The design relies on collector and cleanup never running concurrently on the same chart:
//
// 1. collector_tid: Primary synchronization mechanism
// - Collector sets collector_tid BEFORE accessing the array
// - Collector clears collector_tid AFTER all operations are complete
// - Cleanup code checks collector_tid and SKIPS if non-zero
// - This allows the collector to use lock-free operations (get_unsafe, replace, release)
//
// 2. spinlock + refcount: Coordinates concurrent cleanup operations
// - prd_array_acquire(): Takes spinlock, loads pointer, increments refcount
// - Used by cleanup code when collector is NOT active
// - Prevents races between multiple cleanup threads
//
// 3. Lifecycle guarantee: In production, cleanup only runs when:
// - Stream receiver is stopped (collector thread terminated)
// - collector_tid is explicitly cleared before cleanup
// - Therefore, collector's replace+release never races with cleanup's acquire
//
// HOT PATH (collector active, collector_tid set): Lock-free
// CLEANUP PATH (collector stopped, collector_tid == 0): Uses spinlock
// --------------------------------------------------------------------------------------------------------------------

// Reference-counted container for the pluginsd dimension cache.
// Allocated with the entries inline (flexible array member) so the whole
// cache is a single heap object; freed only when refcount drops to zero.
typedef struct pluginsd_rrddim_array {
    int32_t refcount;                   // Reference count (atomic) - manipulated only via __atomic builtins
    size_t size;                        // Number of entries in the array (fixed at creation)
    struct pluginsd_rrddim entries[];   // Flexible array member - one entry per dimension slot
} PRD_ARRAY;

// --------------------------------------------------------------------------------------------------------------------
// API Functions
// --------------------------------------------------------------------------------------------------------------------

// Create a new array with the specified size and refcount=1
static inline PRD_ARRAY *prd_array_create(size_t size) {
PRD_ARRAY *arr = callocz(1, sizeof(PRD_ARRAY) + size * sizeof(struct pluginsd_rrddim));
arr->refcount = 1;
arr->size = size;
rrd_slot_memory_added(sizeof(PRD_ARRAY) + size * sizeof(struct pluginsd_rrddim));
return arr;
}

// Acquire a reference to the array when spinlock is ALREADY HELD
// Returns NULL if no array exists
// The caller MUST call prd_array_release() when done
// Use this when you need to do additional checks (e.g., collector_tid) under the same spinlock
static inline PRD_ARRAY *prd_array_acquire_locked(PRD_ARRAY **array_ptr) {
    // Load the pointer atomically to stay consistent with prd_array_replace()
    // and prd_array_get_unsafe(), which access *array_ptr via __atomic builtins.
    // The spinlock already serializes us against other cleanup threads, but an
    // atomic acquire-load keeps the access model uniform and avoids a data race
    // if lifecycle assumptions change and a collector swaps the pointer
    // concurrently via prd_array_replace().
    PRD_ARRAY *arr = __atomic_load_n(array_ptr, __ATOMIC_ACQUIRE);
    if (arr) {
        __atomic_fetch_add(&arr->refcount, 1, __ATOMIC_ACQ_REL);
    }
    return arr;
}
Comment on lines +63 to +69
Copy link

Copilot AI Mar 2, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The array pointer is updated elsewhere via __atomic_exchange_n() / __atomic_load_n(), but here it’s read via a plain load (*array_ptr). To keep the access model consistent and avoid accidental data races if lifecycle assumptions change, prefer loading the pointer with an atomic load (e.g., __atomic_load_n(array_ptr, __ATOMIC_ACQUIRE)) even when the caller holds a spinlock.

Copilot uses AI. Check for mistakes.

// Take the spinlock, grab a reference to the current array (if any), and
// drop the lock again. Returns NULL when no array is installed.
// The caller MUST pair a non-NULL result with prd_array_release().
//
// IMPORTANT: Only call this when collector_tid == 0 (collector not active).
// The spinlock coordinates with other cleanup operations only.
static inline PRD_ARRAY *prd_array_acquire(PRD_ARRAY **array_ptr, SPINLOCK *spinlock) {
    spinlock_lock(spinlock);
    PRD_ARRAY *acquired = prd_array_acquire_locked(array_ptr);
    spinlock_unlock(spinlock);

    return acquired;
}

// Release a reference to the array
// If this was the last reference (refcount becomes 0), the array is freed
// Safe to call with NULL
static inline void prd_array_release(PRD_ARRAY *arr) {
    if (!arr)
        return;

    // Decrement via a CAS loop rather than a plain fetch_sub so that an
    // already-zero (or negative) refcount - i.e. a double release - can be
    // detected BEFORE the decrement and left untouched.
    int32_t old_refcount = __atomic_load_n(&arr->refcount, __ATOMIC_ACQUIRE);
    while(true) {
        if(unlikely(old_refcount <= 0)) {
            // Keep the object stable and avoid driving refcount further negative on
            // repeated misuse. Log in all builds; internal_fatal adds extra checks.
            nd_log_limit_static_global_var(erl_prd_refcount_underflow, 1, 0);
            nd_log_limit(&erl_prd_refcount_underflow, NDLS_DAEMON, NDLP_WARNING,
                         "PRD_ARRAY: refcount underflow (was %d) - double release detected",
                         old_refcount);
            internal_fatal(true,
                           "PRD_ARRAY: refcount underflow (was %d) - double release detected", old_refcount);
            return;
        }

        // On CAS failure, old_refcount is reloaded with the current value and
        // the loop re-checks the underflow guard before retrying.
        if(__atomic_compare_exchange_n(&arr->refcount, &old_refcount, old_refcount - 1,
                                       false, __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE))
            break;
    }

    // old_refcount holds the value BEFORE our decrement; 1 means we took the
    // count to zero and own the teardown.
    if(old_refcount == 1) {
        // We were the last reference - free the array
        // Note: The caller is responsible for releasing any RRDDIM_ACQUIRED references
        // in the entries before the final release
        rrd_slot_memory_removed(sizeof(PRD_ARRAY) + arr->size * sizeof(struct pluginsd_rrddim));
        freez(arr);
    }
}

// Atomically swap the installed array pointer for `new_arr` (which may be
// NULL to clear it). Returns the previously installed array - the caller
// owns that reference and must hand it to prd_array_release() - or NULL if
// nothing was installed.
//
// Thread safety depends on context:
// - Collector (collector_tid set): No spinlock needed - cleanup will skip
// - Cleanup (collector_tid == 0): Should hold spinlock to coordinate with other cleanup
static inline PRD_ARRAY *prd_array_replace(PRD_ARRAY **array_ptr, PRD_ARRAY *new_arr) {
    PRD_ARRAY *previous = __atomic_exchange_n(array_ptr, new_arr, __ATOMIC_ACQ_REL);
    return previous;
}

// Peek at the currently installed array WITHOUT taking a reference.
// Intended for quick NULL checks, or when external synchronization already
// guarantees the array cannot be freed underneath us.
// WARNING: The returned pointer may become invalid at any time unless:
// - The caller holds the spinlock, OR
// - The caller is the collector thread with collector_tid set (preventing cleanup)
static inline PRD_ARRAY *prd_array_get_unsafe(PRD_ARRAY **array_ptr) {
    PRD_ARRAY *current = __atomic_load_n(array_ptr, __ATOMIC_ACQUIRE);
    return current;
}

#endif // NETDATA_RRDSET_PLUGINSD_ARRAY_H
Loading
Loading