From 562b56dd322009b3cb7f2e55383b632886a2cd67 Mon Sep 17 00:00:00 2001 From: Danny Kopping Date: Tue, 13 May 2025 13:40:52 +0200 Subject: [PATCH 1/3] feat: fetch prebuilds metrics state in background Signed-off-by: Danny Kopping --- .../coderd/prebuilds/metricscollector.go | 91 +++++++++++++++---- .../coderd/prebuilds/metricscollector_test.go | 5 + enterprise/coderd/prebuilds/reconcile.go | 5 + 3 files changed, 84 insertions(+), 17 deletions(-) diff --git a/enterprise/coderd/prebuilds/metricscollector.go b/enterprise/coderd/prebuilds/metricscollector.go index 7b55227effffa..3452f9af0426b 100644 --- a/enterprise/coderd/prebuilds/metricscollector.go +++ b/enterprise/coderd/prebuilds/metricscollector.go @@ -2,11 +2,13 @@ package prebuilds import ( "context" + "sync/atomic" "time" - "cdr.dev/slog" - "github.com/prometheus/client_golang/prometheus" + "golang.org/x/xerrors" + + "cdr.dev/slog" "github.com/coder/coder/v2/coderd/database" "github.com/coder/coder/v2/coderd/database/dbauthz" @@ -57,18 +59,27 @@ var ( ) ) +const ( + metricsUpdateInterval = time.Second * 15 + metricsUpdateTimeout = time.Second * 10 +) + type MetricsCollector struct { database database.Store logger slog.Logger snapshotter prebuilds.StateSnapshotter + + latestState atomic.Pointer[state] } var _ prometheus.Collector = new(MetricsCollector) +// NewMetricsCollector returns a func NewMetricsCollector(db database.Store, logger slog.Logger, snapshotter prebuilds.StateSnapshotter) *MetricsCollector { + log := logger.Named("prebuilds_metrics_collector") return &MetricsCollector{ database: db, - logger: logger.Named("prebuilds_metrics_collector"), + logger: log, snapshotter: snapshotter, } } @@ -82,34 +93,31 @@ func (*MetricsCollector) Describe(descCh chan<- *prometheus.Desc) { descCh <- eligiblePrebuildsDesc } +// Collect uses the cached state to set configured metrics. +// The state is cached because this function can be called multiple times per second and retrieving the current state +// is an expensive operation. func (mc *MetricsCollector) Collect(metricsCh chan<- prometheus.Metric) { // nolint:gocritic // We need to set an authz context to read metrics from the db. - ctx, cancel := context.WithTimeout(dbauthz.AsPrebuildsOrchestrator(context.Background()), 10*time.Second) - defer cancel() - prebuildMetrics, err := mc.database.GetPrebuildMetrics(ctx) - if err != nil { - mc.logger.Error(ctx, "failed to get prebuild metrics", slog.Error(err)) + ctx := dbauthz.AsPrebuildsOrchestrator(context.Background()) + + currentState := mc.latestState.Load() + if currentState == nil { + mc.logger.Warn(ctx, "failed to set prebuilds metrics; state not set") return } - for _, metric := range prebuildMetrics { + for _, metric := range currentState.prebuildMetrics { metricsCh <- prometheus.MustNewConstMetric(createdPrebuildsDesc, prometheus.CounterValue, float64(metric.CreatedCount), metric.TemplateName, metric.PresetName, metric.OrganizationName) metricsCh <- prometheus.MustNewConstMetric(failedPrebuildsDesc, prometheus.CounterValue, float64(metric.FailedCount), metric.TemplateName, metric.PresetName, metric.OrganizationName) metricsCh <- prometheus.MustNewConstMetric(claimedPrebuildsDesc, prometheus.CounterValue, float64(metric.ClaimedCount), metric.TemplateName, metric.PresetName, metric.OrganizationName) } - snapshot, err := mc.snapshotter.SnapshotState(ctx, mc.database) - if err != nil { - mc.logger.Error(ctx, "failed to get latest prebuild state", slog.Error(err)) - return - } - - for _, preset := range snapshot.Presets { + for _, preset := range currentState.snapshot.Presets { if !preset.UsingActiveVersion { continue } - presetSnapshot, err := snapshot.FilterByPreset(preset.ID) + presetSnapshot, err := currentState.snapshot.FilterByPreset(preset.ID) if err != nil { mc.logger.Error(ctx, "failed to filter by preset", slog.Error(err)) continue @@ -121,3 +129,52 @@ func (mc *MetricsCollector) Collect(metricsCh chan<- prometheus.Metric) { metricsCh <- prometheus.MustNewConstMetric(eligiblePrebuildsDesc, prometheus.GaugeValue, float64(state.Eligible), preset.TemplateName, preset.Name, preset.OrganizationName) } } + +type state struct { + prebuildMetrics []database.GetPrebuildMetricsRow + snapshot *prebuilds.GlobalSnapshot +} + +// BackgroundFetch updates the metrics state every given interval. +func (mc *MetricsCollector) BackgroundFetch(ctx context.Context, updateInterval, updateTimeout time.Duration) { + tick := time.NewTicker(time.Nanosecond) + defer tick.Stop() + + for { + select { + case <-ctx.Done(): + return + case <-tick.C: + // Tick immediately, then set regular interval. + tick.Reset(updateInterval) + + if err := mc.UpdateState(ctx, updateTimeout); err != nil { + mc.logger.Error(ctx, "failed to update prebuilds metrics state", slog.Error(err)) + } + } + } +} + +// UpdateState builds the current metrics state. +func (mc *MetricsCollector) UpdateState(ctx context.Context, timeout time.Duration) error { + mc.logger.Debug(ctx, "fetching prebuilds metrics state") + fetchCtx, fetchCancel := context.WithTimeout(ctx, timeout) + defer fetchCancel() + + prebuildMetrics, err := mc.database.GetPrebuildMetrics(fetchCtx) + if err != nil { + return xerrors.Errorf("fetch prebuild metrics: %w", err) + } + + snapshot, err := mc.snapshotter.SnapshotState(fetchCtx, mc.database) + if err != nil { + return xerrors.Errorf("snapshot state: %w", err) + } + mc.logger.Debug(ctx, "fetched prebuilds metrics state") + + mc.latestState.Store(&state{ + prebuildMetrics: prebuildMetrics, + snapshot: snapshot, + }) + return nil +} diff --git a/enterprise/coderd/prebuilds/metricscollector_test.go b/enterprise/coderd/prebuilds/metricscollector_test.go index 859509ced6635..de3f5d017f715 100644 --- a/enterprise/coderd/prebuilds/metricscollector_test.go +++ b/enterprise/coderd/prebuilds/metricscollector_test.go @@ -16,6 +16,7 @@ import ( "github.com/coder/quartz" "github.com/coder/coder/v2/coderd/database" + "github.com/coder/coder/v2/coderd/database/dbauthz" "github.com/coder/coder/v2/coderd/database/dbgen" "github.com/coder/coder/v2/coderd/database/dbtestutil" agplprebuilds "github.com/coder/coder/v2/coderd/prebuilds" @@ -248,6 +249,10 @@ func TestMetricsCollector(t *testing.T) { setupTestDBWorkspaceAgent(t, db, workspace.ID, eligible) } + // Force an update to the metrics state to allow the collector to collect fresh metrics. + // nolint:gocritic // Authz context needed to retrieve state. + require.NoError(t, collector.UpdateState(dbauthz.AsPrebuildsOrchestrator(ctx), testutil.WaitLong)) + metricsFamilies, err := registry.Gather() require.NoError(t, err) diff --git a/enterprise/coderd/prebuilds/reconcile.go b/enterprise/coderd/prebuilds/reconcile.go index c31da695637ba..ca02ee4218678 100644 --- a/enterprise/coderd/prebuilds/reconcile.go +++ b/enterprise/coderd/prebuilds/reconcile.go @@ -97,6 +97,11 @@ func (c *StoreReconciler) Run(ctx context.Context) { ctx, cancel := context.WithCancelCause(dbauthz.AsPrebuildsOrchestrator(ctx)) c.cancelFn = cancel + // Start updating metrics in the background. + if c.metrics != nil { + go c.metrics.BackgroundFetch(ctx, metricsUpdateInterval, metricsUpdateTimeout) + } + // Everything is in place, reconciler can now be considered as running. // // NOTE: without this atomic bool, Stop might race with Run for the c.cancelFn above. From fcbfb7fed2dc80bb2918f857134c3dfc8b1e632b Mon Sep 17 00:00:00 2001 From: Danny Kopping Date: Tue, 13 May 2025 14:10:15 +0200 Subject: [PATCH 2/3] chore: improvements Signed-off-by: Danny Kopping --- .../coderd/prebuilds/metricscollector.go | 26 ++++++++++++------- enterprise/coderd/prebuilds/reconcile.go | 9 ++++++- 2 files changed, 25 insertions(+), 10 deletions(-) diff --git a/enterprise/coderd/prebuilds/metricscollector.go b/enterprise/coderd/prebuilds/metricscollector.go index 3452f9af0426b..c7ee95a04d787 100644 --- a/enterprise/coderd/prebuilds/metricscollector.go +++ b/enterprise/coderd/prebuilds/metricscollector.go @@ -2,6 +2,7 @@ package prebuilds import ( "context" + "fmt" "sync/atomic" "time" @@ -11,7 +12,6 @@ import ( "cdr.dev/slog" "github.com/coder/coder/v2/coderd/database" - "github.com/coder/coder/v2/coderd/database/dbauthz" "github.com/coder/coder/v2/coderd/prebuilds" ) @@ -57,6 +57,12 @@ var ( labels, nil, ) + lastUpdateDesc = prometheus.NewDesc( + "coderd_prebuilt_workspaces_metrics_last_updated", + "The unix timestamp when the metrics related to prebuilt workspaces were last updated; these metrics are cached.", + []string{}, + nil, + ) ) const ( @@ -74,7 +80,6 @@ type MetricsCollector struct { var _ prometheus.Collector = new(MetricsCollector) -// NewMetricsCollector returns a func NewMetricsCollector(db database.Store, logger slog.Logger, snapshotter prebuilds.StateSnapshotter) *MetricsCollector { log := logger.Named("prebuilds_metrics_collector") return &MetricsCollector{ @@ -91,18 +96,16 @@ func (*MetricsCollector) Describe(descCh chan<- *prometheus.Desc) { descCh <- desiredPrebuildsDesc descCh <- runningPrebuildsDesc descCh <- eligiblePrebuildsDesc + descCh <- lastUpdateDesc } // Collect uses the cached state to set configured metrics. // The state is cached because this function can be called multiple times per second and retrieving the current state // is an expensive operation. func (mc *MetricsCollector) Collect(metricsCh chan<- prometheus.Metric) { - // nolint:gocritic // We need to set an authz context to read metrics from the db. - ctx := dbauthz.AsPrebuildsOrchestrator(context.Background()) - - currentState := mc.latestState.Load() + currentState := mc.latestState.Load() // Grab a copy; it's ok if it goes stale during the course of this func. if currentState == nil { - mc.logger.Warn(ctx, "failed to set prebuilds metrics; state not set") + mc.logger.Warn(context.Background(), "failed to set prebuilds metrics; state not set") return } @@ -119,7 +122,7 @@ func (mc *MetricsCollector) Collect(metricsCh chan<- prometheus.Metric) { presetSnapshot, err := currentState.snapshot.FilterByPreset(preset.ID) if err != nil { - mc.logger.Error(ctx, "failed to filter by preset", slog.Error(err)) + mc.logger.Error(context.Background(), "failed to filter by preset", slog.Error(err)) continue } state := presetSnapshot.CalculateState() @@ -128,11 +131,14 @@ func (mc *MetricsCollector) Collect(metricsCh chan<- prometheus.Metric) { metricsCh <- prometheus.MustNewConstMetric(runningPrebuildsDesc, prometheus.GaugeValue, float64(state.Actual), preset.TemplateName, preset.Name, preset.OrganizationName) metricsCh <- prometheus.MustNewConstMetric(eligiblePrebuildsDesc, prometheus.GaugeValue, float64(state.Eligible), preset.TemplateName, preset.Name, preset.OrganizationName) } + + metricsCh <- prometheus.MustNewConstMetric(lastUpdateDesc, prometheus.GaugeValue, float64(currentState.createdAt.Unix())) } type state struct { prebuildMetrics []database.GetPrebuildMetricsRow snapshot *prebuilds.GlobalSnapshot + createdAt time.Time } // BackgroundFetch updates the metrics state every given interval. @@ -157,6 +163,7 @@ func (mc *MetricsCollector) BackgroundFetch(ctx context.Context, updateInterval, // UpdateState builds the current metrics state. func (mc *MetricsCollector) UpdateState(ctx context.Context, timeout time.Duration) error { + start := time.Now() mc.logger.Debug(ctx, "fetching prebuilds metrics state") fetchCtx, fetchCancel := context.WithTimeout(ctx, timeout) defer fetchCancel() @@ -170,11 +177,12 @@ func (mc *MetricsCollector) UpdateState(ctx context.Context, timeout time.Durati if err != nil { return xerrors.Errorf("snapshot state: %w", err) } - mc.logger.Debug(ctx, "fetched prebuilds metrics state") + mc.logger.Debug(ctx, "fetched prebuilds metrics state", slog.F("duration_secs", fmt.Sprintf("%.2f", time.Since(start).Seconds()))) mc.latestState.Store(&state{ prebuildMetrics: prebuildMetrics, snapshot: snapshot, + createdAt: time.Now(), }) return nil } diff --git a/enterprise/coderd/prebuilds/reconcile.go b/enterprise/coderd/prebuilds/reconcile.go index ca02ee4218678..df0007246bdc6 100644 --- a/enterprise/coderd/prebuilds/reconcile.go +++ b/enterprise/coderd/prebuilds/reconcile.go @@ -5,6 +5,7 @@ import ( "database/sql" "fmt" "math" + "sync" "sync/atomic" "time" @@ -87,10 +88,12 @@ func (c *StoreReconciler) Run(ctx context.Context) { slog.F("backoff_interval", c.cfg.ReconciliationBackoffInterval.String()), slog.F("backoff_lookback", c.cfg.ReconciliationBackoffLookback.String())) + var wg sync.WaitGroup ticker := c.clock.NewTicker(reconciliationInterval) defer ticker.Stop() defer func() { c.done <- struct{}{} + wg.Wait() }() // nolint:gocritic // Reconciliation Loop needs Prebuilds Orchestrator permissions. @@ -99,7 +102,11 @@ func (c *StoreReconciler) Run(ctx context.Context) { // Start updating metrics in the background. if c.metrics != nil { - go c.metrics.BackgroundFetch(ctx, metricsUpdateInterval, metricsUpdateTimeout) + wg.Add(1) + go func() { + defer wg.Done() + c.metrics.BackgroundFetch(ctx, metricsUpdateInterval, metricsUpdateTimeout) + }() } // Everything is in place, reconciler can now be considered as running. From 35cebf6c5fc314d26de89459a9308f2e1c2452ca Mon Sep 17 00:00:00 2001 From: Danny Kopping Date: Tue, 13 May 2025 17:03:07 +0200 Subject: [PATCH 3/3] chore: review feedback Signed-off-by: Danny Kopping --- enterprise/coderd/prebuilds/metricscollector.go | 11 ++++++----- enterprise/coderd/prebuilds/reconcile.go | 12 +++++++----- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/enterprise/coderd/prebuilds/metricscollector.go b/enterprise/coderd/prebuilds/metricscollector.go index c7ee95a04d787..76089c025243d 100644 --- a/enterprise/coderd/prebuilds/metricscollector.go +++ b/enterprise/coderd/prebuilds/metricscollector.go @@ -12,6 +12,7 @@ import ( "cdr.dev/slog" "github.com/coder/coder/v2/coderd/database" + "github.com/coder/coder/v2/coderd/database/dbtime" "github.com/coder/coder/v2/coderd/prebuilds" ) @@ -75,7 +76,7 @@ type MetricsCollector struct { logger slog.Logger snapshotter prebuilds.StateSnapshotter - latestState atomic.Pointer[state] + latestState atomic.Pointer[metricsState] } var _ prometheus.Collector = new(MetricsCollector) @@ -106,6 +107,7 @@ func (mc *MetricsCollector) Collect(metricsCh chan<- prometheus.Metric) { currentState := mc.latestState.Load() // Grab a copy; it's ok if it goes stale during the course of this func. if currentState == nil { mc.logger.Warn(context.Background(), "failed to set prebuilds metrics; state not set") + metricsCh <- prometheus.MustNewConstMetric(lastUpdateDesc, prometheus.GaugeValue, 0) return } @@ -135,7 +137,7 @@ func (mc *MetricsCollector) Collect(metricsCh chan<- prometheus.Metric) { metricsCh <- prometheus.MustNewConstMetric(lastUpdateDesc, prometheus.GaugeValue, float64(currentState.createdAt.Unix())) } -type state struct { +type metricsState struct { prebuildMetrics []database.GetPrebuildMetricsRow snapshot *prebuilds.GlobalSnapshot createdAt time.Time @@ -164,7 +166,6 @@ func (mc *MetricsCollector) BackgroundFetch(ctx context.Context, updateInterval, // UpdateState builds the current metrics state. func (mc *MetricsCollector) UpdateState(ctx context.Context, timeout time.Duration) error { start := time.Now() - mc.logger.Debug(ctx, "fetching prebuilds metrics state") fetchCtx, fetchCancel := context.WithTimeout(ctx, timeout) defer fetchCancel() @@ -179,10 +180,10 @@ func (mc *MetricsCollector) UpdateState(ctx context.Context, timeout time.Durati } mc.logger.Debug(ctx, "fetched prebuilds metrics state", slog.F("duration_secs", fmt.Sprintf("%.2f", time.Since(start).Seconds()))) - mc.latestState.Store(&state{ + mc.latestState.Store(&metricsState{ prebuildMetrics: prebuildMetrics, snapshot: snapshot, - createdAt: time.Now(), + createdAt: dbtime.Now(), }) return nil } diff --git a/enterprise/coderd/prebuilds/reconcile.go b/enterprise/coderd/prebuilds/reconcile.go index df0007246bdc6..79a8baa337e72 100644 --- a/enterprise/coderd/prebuilds/reconcile.go +++ b/enterprise/coderd/prebuilds/reconcile.go @@ -68,10 +68,12 @@ func NewStoreReconciler(store database.Store, provisionNotifyCh: make(chan database.ProvisionerJob, 10), } - reconciler.metrics = NewMetricsCollector(store, logger, reconciler) - if err := registerer.Register(reconciler.metrics); err != nil { - // If the registerer fails to register the metrics collector, it's not fatal. - logger.Error(context.Background(), "failed to register prometheus metrics", slog.Error(err)) + if registerer != nil { + reconciler.metrics = NewMetricsCollector(store, logger, reconciler) + if err := registerer.Register(reconciler.metrics); err != nil { + // If the registerer fails to register the metrics collector, it's not fatal. + logger.Error(context.Background(), "failed to register prometheus metrics", slog.Error(err)) + } } return reconciler @@ -92,8 +94,8 @@ func (c *StoreReconciler) Run(ctx context.Context) { ticker := c.clock.NewTicker(reconciliationInterval) defer ticker.Stop() defer func() { - c.done <- struct{}{} wg.Wait() + c.done <- struct{}{} }() // nolint:gocritic // Reconciliation Loop needs Prebuilds Orchestrator permissions.