-
Notifications
You must be signed in to change notification settings - Fork 891
feat: fetch prebuilds metrics state in background #17792
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,14 +2,17 @@ package prebuilds | |
|
||
import ( | ||
"context" | ||
"fmt" | ||
"sync/atomic" | ||
"time" | ||
|
||
"cdr.dev/slog" | ||
|
||
"github.com/prometheus/client_golang/prometheus" | ||
"golang.org/x/xerrors" | ||
|
||
"cdr.dev/slog" | ||
|
||
"github.com/coder/coder/v2/coderd/database" | ||
"github.com/coder/coder/v2/coderd/database/dbauthz" | ||
"github.com/coder/coder/v2/coderd/database/dbtime" | ||
"github.com/coder/coder/v2/coderd/prebuilds" | ||
) | ||
|
||
|
@@ -55,20 +58,34 @@ var ( | |
labels, | ||
nil, | ||
) | ||
lastUpdateDesc = prometheus.NewDesc( | ||
"coderd_prebuilt_workspaces_metrics_last_updated", | ||
"The unix timestamp when the metrics related to prebuilt workspaces were last updated; these metrics are cached.", | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is unix timestamp easy to alert on? Like can you do something like There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. +1 from me for duration since last successful fetch There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The idiomatic approach is to use unix timestamps, see There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. So I guess we have an existing metric for the coder server start timestamp? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I don't think so (or at least not one we export), but I think as long as this metric is updated relative to itself and |
||
[]string{}, | ||
nil, | ||
) | ||
) | ||
|
||
// Timing values for refreshing the cached metrics state (15s and 10s respectively). | ||
// NOTE(review): presumably these are the updateInterval / updateTimeout arguments given to | ||
// BackgroundFetch; no use of them is visible in this view — confirm at the call site. | ||
const ( | ||
metricsUpdateInterval = time.Second * 15 | ||
metricsUpdateTimeout = time.Second * 10 | ||
) | ||
|
||
// MetricsCollector exposes prebuilds metrics to Prometheus from a cached state. | ||
// UpdateState queries database and snapshotter and stores the result in latestState; | ||
// Collect only reads latestState and does not query the database itself. | ||
type MetricsCollector struct { | ||
database database.Store | ||
logger slog.Logger | ||
snapshotter prebuilds.StateSnapshotter | ||
|
||
// latestState holds the most recently stored metricsState. It is nil until the first | ||
// successful UpdateState; Collect handles that case by emitting only a zero timestamp. | ||
latestState atomic.Pointer[metricsState] | ||
} | ||
|
||
// Compile-time assertion that *MetricsCollector implements prometheus.Collector. | ||
var _ prometheus.Collector = new(MetricsCollector) | ||
|
||
// NewMetricsCollector returns a MetricsCollector backed by db and snapshotter, with its | ||
// logger named "prebuilds_metrics_collector". | ||
// latestState is not populated here, so it stays nil until UpdateState succeeds. | ||
func NewMetricsCollector(db database.Store, logger slog.Logger, snapshotter prebuilds.StateSnapshotter) *MetricsCollector { | ||
log := logger.Named("prebuilds_metrics_collector") | ||
return &MetricsCollector{ | ||
database: db, | ||
logger: logger.Named("prebuilds_metrics_collector"), | ||
logger: log, | ||
snapshotter: snapshotter, | ||
} | ||
} | ||
|
@@ -80,38 +97,34 @@ func (*MetricsCollector) Describe(descCh chan<- *prometheus.Desc) { | |
descCh <- desiredPrebuildsDesc | ||
descCh <- runningPrebuildsDesc | ||
descCh <- eligiblePrebuildsDesc | ||
descCh <- lastUpdateDesc | ||
} | ||
|
||
// Collect uses the cached state to set configured metrics. | ||
// The state is cached because this function can be called multiple times per second and retrieving the current state | ||
// is an expensive operation. | ||
func (mc *MetricsCollector) Collect(metricsCh chan<- prometheus.Metric) { | ||
// nolint:gocritic // We need to set an authz context to read metrics from the db. | ||
ctx, cancel := context.WithTimeout(dbauthz.AsPrebuildsOrchestrator(context.Background()), 10*time.Second) | ||
defer cancel() | ||
prebuildMetrics, err := mc.database.GetPrebuildMetrics(ctx) | ||
if err != nil { | ||
mc.logger.Error(ctx, "failed to get prebuild metrics", slog.Error(err)) | ||
currentState := mc.latestState.Load() // Grab a copy; it's ok if it goes stale during the course of this func. | ||
if currentState == nil { | ||
mc.logger.Warn(context.Background(), "failed to set prebuilds metrics; state not set") | ||
dannykopping marked this conversation as resolved.
Show resolved
Hide resolved
|
||
metricsCh <- prometheus.MustNewConstMetric(lastUpdateDesc, prometheus.GaugeValue, 0) | ||
return | ||
} | ||
|
||
for _, metric := range prebuildMetrics { | ||
for _, metric := range currentState.prebuildMetrics { | ||
metricsCh <- prometheus.MustNewConstMetric(createdPrebuildsDesc, prometheus.CounterValue, float64(metric.CreatedCount), metric.TemplateName, metric.PresetName, metric.OrganizationName) | ||
metricsCh <- prometheus.MustNewConstMetric(failedPrebuildsDesc, prometheus.CounterValue, float64(metric.FailedCount), metric.TemplateName, metric.PresetName, metric.OrganizationName) | ||
metricsCh <- prometheus.MustNewConstMetric(claimedPrebuildsDesc, prometheus.CounterValue, float64(metric.ClaimedCount), metric.TemplateName, metric.PresetName, metric.OrganizationName) | ||
} | ||
|
||
snapshot, err := mc.snapshotter.SnapshotState(ctx, mc.database) | ||
if err != nil { | ||
mc.logger.Error(ctx, "failed to get latest prebuild state", slog.Error(err)) | ||
return | ||
} | ||
|
||
for _, preset := range snapshot.Presets { | ||
for _, preset := range currentState.snapshot.Presets { | ||
if !preset.UsingActiveVersion { | ||
continue | ||
} | ||
|
||
presetSnapshot, err := snapshot.FilterByPreset(preset.ID) | ||
presetSnapshot, err := currentState.snapshot.FilterByPreset(preset.ID) | ||
if err != nil { | ||
mc.logger.Error(ctx, "failed to filter by preset", slog.Error(err)) | ||
mc.logger.Error(context.Background(), "failed to filter by preset", slog.Error(err)) | ||
continue | ||
} | ||
state := presetSnapshot.CalculateState() | ||
|
@@ -120,4 +133,57 @@ func (mc *MetricsCollector) Collect(metricsCh chan<- prometheus.Metric) { | |
metricsCh <- prometheus.MustNewConstMetric(runningPrebuildsDesc, prometheus.GaugeValue, float64(state.Actual), preset.TemplateName, preset.Name, preset.OrganizationName) | ||
metricsCh <- prometheus.MustNewConstMetric(eligiblePrebuildsDesc, prometheus.GaugeValue, float64(state.Eligible), preset.TemplateName, preset.Name, preset.OrganizationName) | ||
} | ||
|
||
metricsCh <- prometheus.MustNewConstMetric(lastUpdateDesc, prometheus.GaugeValue, float64(currentState.createdAt.Unix())) | ||
} | ||
|
||
// metricsState is the bundle of data that UpdateState stores and Collect reads. | ||
type metricsState struct { | ||
// prebuildMetrics is the result of database.GetPrebuildMetrics. | ||
prebuildMetrics []database.GetPrebuildMetricsRow | ||
// snapshot is the result of snapshotter.SnapshotState. | ||
snapshot *prebuilds.GlobalSnapshot | ||
// createdAt is set from dbtime.Now() when the state is stored; Collect exports it as | ||
// Unix seconds through lastUpdateDesc. | ||
createdAt time.Time | ||
} | ||
|
||
// BackgroundFetch refreshes the cached metrics state in a loop and returns only when ctx is done. | ||
// The ticker starts with a 1ns period so the first refresh fires almost immediately; it is | ||
// then reset to updateInterval. updateTimeout is passed to UpdateState as the time limit | ||
// for each refresh. | ||
// A failed refresh is logged and the loop continues. UpdateState stores a new state only on | ||
// success, so a failure leaves the previously stored state in place. | ||
// NOTE(review): this call blocks, so it is presumably meant to run in its own goroutine — confirm at the call site. | ||
func (mc *MetricsCollector) BackgroundFetch(ctx context.Context, updateInterval, updateTimeout time.Duration) { | ||
tick := time.NewTicker(time.Nanosecond) | ||
defer tick.Stop() | ||
|
||
for { | ||
select { | ||
case <-ctx.Done(): | ||
return | ||
case <-tick.C: | ||
// Tick immediately, then set regular interval. | ||
// Reset runs on every tick, not only the first. | ||
tick.Reset(updateInterval) | ||
|
||
if err := mc.UpdateState(ctx, updateTimeout); err != nil { | ||
mc.logger.Error(ctx, "failed to update prebuilds metrics state", slog.Error(err)) | ||
} | ||
} | ||
} | ||
} | ||
|
||
// UpdateState fetches the prebuild metrics rows and a state snapshot and, on success, | ||
// stores them as the latest state read by Collect. | ||
// Both fetches share one context derived from ctx with the given timeout. | ||
// If either fetch fails, a wrapped error is returned and the stored state is not replaced. | ||
func (mc *MetricsCollector) UpdateState(ctx context.Context, timeout time.Duration) error { | ||
start := time.Now() | ||
fetchCtx, fetchCancel := context.WithTimeout(ctx, timeout) | ||
defer fetchCancel() | ||
|
||
prebuildMetrics, err := mc.database.GetPrebuildMetrics(fetchCtx) | ||
if err != nil { | ||
return xerrors.Errorf("fetch prebuild metrics: %w", err) | ||
} | ||
|
||
snapshot, err := mc.snapshotter.SnapshotState(fetchCtx, mc.database) | ||
if err != nil { | ||
return xerrors.Errorf("snapshot state: %w", err) | ||
} | ||
// The logged duration covers both fetches; it is formatted as seconds with two decimals. | ||
mc.logger.Debug(ctx, "fetched prebuilds metrics state", slog.F("duration_secs", fmt.Sprintf("%.2f", time.Since(start).Seconds()))) | ||
|
||
// createdAt is taken after both fetches have completed, not at start. | ||
mc.latestState.Store(&metricsState{ | ||
prebuildMetrics: prebuildMetrics, | ||
snapshot: snapshot, | ||
createdAt: dbtime.Now(), | ||
}) | ||
return nil | ||
} |
Uh oh!
There was an error while loading. Please reload this page.