Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

feat: fetch prebuilds metrics state in background #17792

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
May 13, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
106 changes: 86 additions & 20 deletions enterprise/coderd/prebuilds/metricscollector.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,17 @@ package prebuilds

import (
"context"
"fmt"
"sync/atomic"
"time"

"cdr.dev/slog"

"github.com/prometheus/client_golang/prometheus"
"golang.org/x/xerrors"

"cdr.dev/slog"

"github.com/coder/coder/v2/coderd/database"
"github.com/coder/coder/v2/coderd/database/dbauthz"
"github.com/coder/coder/v2/coderd/database/dbtime"
"github.com/coder/coder/v2/coderd/prebuilds"
)

Expand Down Expand Up @@ -55,20 +58,34 @@ var (
labels,
nil,
)
lastUpdateDesc = prometheus.NewDesc(
"coderd_prebuilt_workspaces_metrics_last_updated",
"The unix timestamp when the metrics related to prebuilt workspaces were last updated; these metrics are cached.",
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is unix timestamp easy to alert on? Like can you do something like unix_now() - metric_value > 1000 or something in grafana and co? If not, it might be better if this was a duration since the last successful fetch instead.

Copy link
Member

@johnstcn johnstcn May 13, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+1 from me for duration since last successful fetch

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The idiomatic approach is to use unix timestamps, see prometheus_config_last_reload_success_timestamp_seconds.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So I guess we have an existing metric for the coder server start timestamp?

Copy link
Contributor Author

@dannykopping dannykopping May 13, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think so (or at least not one we export), but I think as long as this metric is updated relative to itself and up is taken into consideration, it should be useful.

[]string{},
nil,
)
)

// Refresh settings that StoreReconciler.Run passes to MetricsCollector.BackgroundFetch.
const (
// metricsUpdateInterval is the period between refreshes of the cached metrics state.
metricsUpdateInterval = time.Second * 15
// metricsUpdateTimeout is the deadline applied to a single refresh (see UpdateState).
metricsUpdateTimeout = time.Second * 10
)

// MetricsCollector is a prometheus.Collector for prebuilt-workspace metrics.
// Collect does not query the database itself; it reads the state most recently
// stored by UpdateState, which BackgroundFetch calls on a timer.
type MetricsCollector struct {
// database is the store that UpdateState queries for prebuild metrics and
// hands to the snapshotter.
database database.Store
// logger receives refresh failures and collection warnings.
logger slog.Logger
// snapshotter produces the global prebuilds snapshot captured by UpdateState.
snapshotter prebuilds.StateSnapshotter

// latestState is the cached state: written by UpdateState, read by Collect.
// It is nil until the first successful UpdateState call.
latestState atomic.Pointer[metricsState]
}

var _ prometheus.Collector = new(MetricsCollector)

func NewMetricsCollector(db database.Store, logger slog.Logger, snapshotter prebuilds.StateSnapshotter) *MetricsCollector {
log := logger.Named("prebuilds_metrics_collector")
return &MetricsCollector{
database: db,
logger: logger.Named("prebuilds_metrics_collector"),
logger: log,
snapshotter: snapshotter,
}
}
Expand All @@ -80,38 +97,34 @@ func (*MetricsCollector) Describe(descCh chan<- *prometheus.Desc) {
descCh <- desiredPrebuildsDesc
descCh <- runningPrebuildsDesc
descCh <- eligiblePrebuildsDesc
descCh <- lastUpdateDesc
}

// Collect uses the cached state to set configured metrics.
// The state is cached because this function can be called multiple times per second and retrieving the current state
// is an expensive operation.
func (mc *MetricsCollector) Collect(metricsCh chan<- prometheus.Metric) {
// nolint:gocritic // We need to set an authz context to read metrics from the db.
ctx, cancel := context.WithTimeout(dbauthz.AsPrebuildsOrchestrator(context.Background()), 10*time.Second)
defer cancel()
prebuildMetrics, err := mc.database.GetPrebuildMetrics(ctx)
if err != nil {
mc.logger.Error(ctx, "failed to get prebuild metrics", slog.Error(err))
currentState := mc.latestState.Load() // Grab a copy; it's ok if it goes stale during the course of this func.
if currentState == nil {
mc.logger.Warn(context.Background(), "failed to set prebuilds metrics; state not set")
metricsCh <- prometheus.MustNewConstMetric(lastUpdateDesc, prometheus.GaugeValue, 0)
return
}

for _, metric := range prebuildMetrics {
for _, metric := range currentState.prebuildMetrics {
metricsCh <- prometheus.MustNewConstMetric(createdPrebuildsDesc, prometheus.CounterValue, float64(metric.CreatedCount), metric.TemplateName, metric.PresetName, metric.OrganizationName)
metricsCh <- prometheus.MustNewConstMetric(failedPrebuildsDesc, prometheus.CounterValue, float64(metric.FailedCount), metric.TemplateName, metric.PresetName, metric.OrganizationName)
metricsCh <- prometheus.MustNewConstMetric(claimedPrebuildsDesc, prometheus.CounterValue, float64(metric.ClaimedCount), metric.TemplateName, metric.PresetName, metric.OrganizationName)
}

snapshot, err := mc.snapshotter.SnapshotState(ctx, mc.database)
if err != nil {
mc.logger.Error(ctx, "failed to get latest prebuild state", slog.Error(err))
return
}

for _, preset := range snapshot.Presets {
for _, preset := range currentState.snapshot.Presets {
if !preset.UsingActiveVersion {
continue
}

presetSnapshot, err := snapshot.FilterByPreset(preset.ID)
presetSnapshot, err := currentState.snapshot.FilterByPreset(preset.ID)
if err != nil {
mc.logger.Error(ctx, "failed to filter by preset", slog.Error(err))
mc.logger.Error(context.Background(), "failed to filter by preset", slog.Error(err))
continue
}
state := presetSnapshot.CalculateState()
Expand All @@ -120,4 +133,57 @@ func (mc *MetricsCollector) Collect(metricsCh chan<- prometheus.Metric) {
metricsCh <- prometheus.MustNewConstMetric(runningPrebuildsDesc, prometheus.GaugeValue, float64(state.Actual), preset.TemplateName, preset.Name, preset.OrganizationName)
metricsCh <- prometheus.MustNewConstMetric(eligiblePrebuildsDesc, prometheus.GaugeValue, float64(state.Eligible), preset.TemplateName, preset.Name, preset.OrganizationName)
}

metricsCh <- prometheus.MustNewConstMetric(lastUpdateDesc, prometheus.GaugeValue, float64(currentState.createdAt.Unix()))
}

// metricsState is the cached data that Collect turns into Prometheus metrics.
// UpdateState builds a new value on every successful refresh and swaps it into
// MetricsCollector.latestState.
type metricsState struct {
// prebuildMetrics holds the rows returned by database.GetPrebuildMetrics.
prebuildMetrics []database.GetPrebuildMetricsRow
// snapshot is the result of the snapshotter's SnapshotState call.
snapshot *prebuilds.GlobalSnapshot
// createdAt is when this state was stored; Collect exports it as the
// "last updated" unix timestamp.
createdAt time.Time
}

// BackgroundFetch keeps the cached metrics state fresh until ctx is done.
//
// The first refresh is triggered almost immediately; afterwards one refresh is
// attempted every updateInterval. Each attempt is bounded by updateTimeout.
// A failed refresh is logged and the loop carries on, so the previously cached
// state (if any) stays in place until a later attempt succeeds.
func (mc *MetricsCollector) BackgroundFetch(ctx context.Context, updateInterval, updateTimeout time.Duration) {
	// Start with a 1ns period so the first tick arrives right away; the period
	// is switched to updateInterval as soon as that tick is consumed.
	ticker := time.NewTicker(time.Nanosecond)
	defer ticker.Stop()

	done := ctx.Done()
	for {
		select {
		case <-done:
			return
		case <-ticker.C:
		}

		// Tick immediately, then set regular interval.
		ticker.Reset(updateInterval)

		err := mc.UpdateState(ctx, updateTimeout)
		if err == nil {
			continue
		}
		mc.logger.Error(ctx, "failed to update prebuilds metrics state", slog.Error(err))
	}
}

// UpdateState fetches the prebuild metrics rows and a global prebuilds
// snapshot, then publishes them as the collector's latest state.
//
// Both fetches share one deadline of timeout, derived from ctx. If either
// fetch fails, a wrapped error is returned and the previously published state
// is left untouched. On success the new state is stamped with the current
// database time and nil is returned.
func (mc *MetricsCollector) UpdateState(ctx context.Context, timeout time.Duration) error {
	begin := time.Now()

	boundedCtx, release := context.WithTimeout(ctx, timeout)
	defer release()

	rows, err := mc.database.GetPrebuildMetrics(boundedCtx)
	if err != nil {
		return xerrors.Errorf("fetch prebuild metrics: %w", err)
	}

	global, err := mc.snapshotter.SnapshotState(boundedCtx, mc.database)
	if err != nil {
		return xerrors.Errorf("snapshot state: %w", err)
	}

	// Report how long the two fetches took before publishing the result.
	elapsed := fmt.Sprintf("%.2f", time.Since(begin).Seconds())
	mc.logger.Debug(ctx, "fetched prebuilds metrics state", slog.F("duration_secs", elapsed))

	next := &metricsState{
		prebuildMetrics: rows,
		snapshot:        global,
		createdAt:       dbtime.Now(),
	}
	mc.latestState.Store(next)
	return nil
}
5 changes: 5 additions & 0 deletions enterprise/coderd/prebuilds/metricscollector_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ import (
"github.com/coder/quartz"

"github.com/coder/coder/v2/coderd/database"
"github.com/coder/coder/v2/coderd/database/dbauthz"
"github.com/coder/coder/v2/coderd/database/dbgen"
"github.com/coder/coder/v2/coderd/database/dbtestutil"
agplprebuilds "github.com/coder/coder/v2/coderd/prebuilds"
Expand Down Expand Up @@ -248,6 +249,10 @@ func TestMetricsCollector(t *testing.T) {
setupTestDBWorkspaceAgent(t, db, workspace.ID, eligible)
}

// Force an update to the metrics state to allow the collector to collect fresh metrics.
// nolint:gocritic // Authz context needed to retrieve state.
require.NoError(t, collector.UpdateState(dbauthz.AsPrebuildsOrchestrator(ctx), testutil.WaitLong))

metricsFamilies, err := registry.Gather()
require.NoError(t, err)

Expand Down
22 changes: 18 additions & 4 deletions enterprise/coderd/prebuilds/reconcile.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"database/sql"
"fmt"
"math"
"sync"
"sync/atomic"
"time"

Expand Down Expand Up @@ -67,10 +68,12 @@ func NewStoreReconciler(store database.Store,
provisionNotifyCh: make(chan database.ProvisionerJob, 10),
}

reconciler.metrics = NewMetricsCollector(store, logger, reconciler)
if err := registerer.Register(reconciler.metrics); err != nil {
// If the registerer fails to register the metrics collector, it's not fatal.
logger.Error(context.Background(), "failed to register prometheus metrics", slog.Error(err))
if registerer != nil {
reconciler.metrics = NewMetricsCollector(store, logger, reconciler)
if err := registerer.Register(reconciler.metrics); err != nil {
// If the registerer fails to register the metrics collector, it's not fatal.
logger.Error(context.Background(), "failed to register prometheus metrics", slog.Error(err))
}
}

return reconciler
Expand All @@ -87,16 +90,27 @@ func (c *StoreReconciler) Run(ctx context.Context) {
slog.F("backoff_interval", c.cfg.ReconciliationBackoffInterval.String()),
slog.F("backoff_lookback", c.cfg.ReconciliationBackoffLookback.String()))

var wg sync.WaitGroup
ticker := c.clock.NewTicker(reconciliationInterval)
defer ticker.Stop()
defer func() {
wg.Wait()
c.done <- struct{}{}
}()

// nolint:gocritic // Reconciliation Loop needs Prebuilds Orchestrator permissions.
ctx, cancel := context.WithCancelCause(dbauthz.AsPrebuildsOrchestrator(ctx))
c.cancelFn = cancel

// Start updating metrics in the background.
if c.metrics != nil {
wg.Add(1)
go func() {
defer wg.Done()
c.metrics.BackgroundFetch(ctx, metricsUpdateInterval, metricsUpdateTimeout)
}()
}

// Everything is in place, reconciler can now be considered as running.
//
// NOTE: without this atomic bool, Stop might race with Run for the c.cancelFn above.
Expand Down
Loading