Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 0347231

Browse files
authored
feat: expose agent metrics via Prometheus endpoint (coder#7011)
* WIP * WIP * WIP * Agents * fix * 1min * fix * WIP * Test * docs * fmt * Add timer to measure the metrics collection * Use CachedGaugeVec * Unit tests * Address PR comments
1 parent dd85ea8 commit 0347231

File tree

7 files changed

+629
-48
lines changed

7 files changed

+629
-48
lines changed

cli/server.go

+9
Original file line numberDiff line numberDiff line change
@@ -896,6 +896,15 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd.
896896
return xerrors.Errorf("create coder API: %w", err)
897897
}
898898

899+
if cfg.Prometheus.Enable {
900+
// Agent metrics require reference to the tailnet coordinator, so must be initiated after Coder API.
901+
closeAgentsFunc, err := prometheusmetrics.Agents(ctx, logger, options.PrometheusRegistry, coderAPI.Database, &coderAPI.TailnetCoordinator, options.DERPMap, coderAPI.Options.AgentInactiveDisconnectTimeout, 0)
902+
if err != nil {
903+
return xerrors.Errorf("register agents prometheus metric: %w", err)
904+
}
905+
defer closeAgentsFunc()
906+
}
907+
899908
client := codersdk.New(localURL)
900909
if localURL.Scheme == "https" && isLocalhost(localURL.Hostname()) {
901910
// The certificate will likely be self-signed or for a different

coderd/prometheusmetrics/collector.go

+95
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
package prometheusmetrics
2+
3+
import (
4+
"sync"
5+
6+
"github.com/prometheus/client_golang/prometheus"
7+
)
8+
9+
// CachedGaugeVec is a wrapper for the prometheus.GaugeVec which allows
10+
// for staging changes in the metrics vector. Calling "WithLabelValues(...)"
11+
// will update the internal gauge value, but it will not be returned by
12+
// "Collect(...)" until the "Commit()" method is called. The "Commit()" method
13+
// resets the internal gauge and applies all staged changes to it.
14+
//
15+
// The Use of CachedGaugeVec is recommended for use cases when there is a risk
16+
// that the Prometheus collector receives incomplete metrics, collected
17+
// in the middle of metrics recalculation, between "Reset()" and the last
18+
// "WithLabelValues()" call.
19+
type CachedGaugeVec struct {
20+
m sync.Mutex
21+
22+
gaugeVec *prometheus.GaugeVec
23+
records []vectorRecord
24+
}
25+
26+
// Compile-time check that *CachedGaugeVec satisfies prometheus.Collector.
var _ prometheus.Collector = (*CachedGaugeVec)(nil)
27+
28+
// VectorOperation identifies how a staged value is applied to a gauge when
// the staged changes are committed.
type VectorOperation int

const (
	// VectorOperationAdd adds the staged value to the gauge.
	VectorOperationAdd VectorOperation = iota
	// VectorOperationSet overwrites the gauge with the staged value.
	VectorOperationSet
)
34+
35+
type vectorRecord struct {
36+
operation VectorOperation
37+
value float64
38+
labelValues []string
39+
}
40+
41+
func NewCachedGaugeVec(gaugeVec *prometheus.GaugeVec) *CachedGaugeVec {
42+
return &CachedGaugeVec{
43+
gaugeVec: gaugeVec,
44+
}
45+
}
46+
47+
func (v *CachedGaugeVec) Describe(desc chan<- *prometheus.Desc) {
48+
v.gaugeVec.Describe(desc)
49+
}
50+
51+
func (v *CachedGaugeVec) Collect(ch chan<- prometheus.Metric) {
52+
v.m.Lock()
53+
defer v.m.Unlock()
54+
55+
v.gaugeVec.Collect(ch)
56+
}
57+
58+
func (v *CachedGaugeVec) WithLabelValues(operation VectorOperation, value float64, labelValues ...string) {
59+
switch operation {
60+
case VectorOperationAdd:
61+
case VectorOperationSet:
62+
default:
63+
panic("unsupported vector operation")
64+
}
65+
66+
v.m.Lock()
67+
defer v.m.Unlock()
68+
69+
v.records = append(v.records, vectorRecord{
70+
operation: operation,
71+
value: value,
72+
labelValues: labelValues,
73+
})
74+
}
75+
76+
// Commit will set the internal value as the cached value to return from "Collect()".
77+
// The internal metric value is completely reset, so the caller should expect
78+
// the gauge to be empty for the next 'WithLabelValues' values.
79+
func (v *CachedGaugeVec) Commit() {
80+
v.m.Lock()
81+
defer v.m.Unlock()
82+
83+
v.gaugeVec.Reset()
84+
for _, record := range v.records {
85+
g := v.gaugeVec.WithLabelValues(record.labelValues...)
86+
switch record.operation {
87+
case VectorOperationAdd:
88+
g.Add(record.value)
89+
case VectorOperationSet:
90+
g.Set(record.value)
91+
}
92+
}
93+
94+
v.records = nil
95+
}
+140
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
1+
package prometheusmetrics_test
2+
3+
import (
4+
"sort"
5+
"testing"
6+
7+
"github.com/prometheus/client_golang/prometheus"
8+
dto "github.com/prometheus/client_model/go"
9+
"github.com/stretchr/testify/assert"
10+
"github.com/stretchr/testify/require"
11+
12+
"github.com/coder/coder/coderd/prometheusmetrics"
13+
)
14+
15+
func TestCollector_Add(t *testing.T) {
16+
t.Parallel()
17+
18+
// given
19+
agentsGauge := prometheusmetrics.NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{
20+
Namespace: "coderd",
21+
Subsystem: "agents",
22+
Name: "up",
23+
Help: "The number of active agents per workspace.",
24+
}, []string{"username", "workspace_name"}))
25+
26+
// when
27+
agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 7, "first user", "my workspace")
28+
agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 23, "second user", "your workspace")
29+
agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 1, "first user", "my workspace")
30+
agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 25, "second user", "your workspace")
31+
agentsGauge.Commit()
32+
33+
// then
34+
ch := make(chan prometheus.Metric, 2)
35+
agentsGauge.Collect(ch)
36+
37+
metrics := collectAndSortMetrics(t, agentsGauge, 2)
38+
39+
assert.Equal(t, "first user", metrics[0].Label[0].GetValue()) // Username
40+
assert.Equal(t, "my workspace", metrics[0].Label[1].GetValue()) // Workspace name
41+
assert.Equal(t, 8, int(metrics[0].Gauge.GetValue())) // Metric value
42+
43+
assert.Equal(t, "second user", metrics[1].Label[0].GetValue()) // Username
44+
assert.Equal(t, "your workspace", metrics[1].Label[1].GetValue()) // Workspace name
45+
assert.Equal(t, 48, int(metrics[1].Gauge.GetValue())) // Metric value
46+
}
47+
48+
func TestCollector_Set(t *testing.T) {
49+
t.Parallel()
50+
51+
// given
52+
agentsGauge := prometheusmetrics.NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{
53+
Namespace: "coderd",
54+
Subsystem: "agents",
55+
Name: "up",
56+
Help: "The number of active agents per workspace.",
57+
}, []string{"username", "workspace_name"}))
58+
59+
// when
60+
agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationSet, 3, "first user", "my workspace")
61+
agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationSet, 4, "second user", "your workspace")
62+
agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationSet, 5, "first user", "my workspace")
63+
agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationSet, 6, "second user", "your workspace")
64+
agentsGauge.Commit()
65+
66+
// then
67+
ch := make(chan prometheus.Metric, 2)
68+
agentsGauge.Collect(ch)
69+
70+
metrics := collectAndSortMetrics(t, agentsGauge, 2)
71+
72+
assert.Equal(t, "first user", metrics[0].Label[0].GetValue()) // Username
73+
assert.Equal(t, "my workspace", metrics[0].Label[1].GetValue()) // Workspace name
74+
assert.Equal(t, 5, int(metrics[0].Gauge.GetValue())) // Metric value
75+
76+
assert.Equal(t, "second user", metrics[1].Label[0].GetValue()) // Username
77+
assert.Equal(t, "your workspace", metrics[1].Label[1].GetValue()) // Workspace name
78+
assert.Equal(t, 6, int(metrics[1].Gauge.GetValue())) // Metric value
79+
}
80+
81+
func TestCollector_Set_Add(t *testing.T) {
82+
t.Parallel()
83+
84+
// given
85+
agentsGauge := prometheusmetrics.NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{
86+
Namespace: "coderd",
87+
Subsystem: "agents",
88+
Name: "up",
89+
Help: "The number of active agents per workspace.",
90+
}, []string{"username", "workspace_name"}))
91+
92+
// when
93+
agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 9, "first user", "my workspace")
94+
agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 8, "second user", "your workspace")
95+
agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 7, "first user", "my workspace")
96+
agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 6, "second user", "your workspace")
97+
agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationSet, 5, "first user", "my workspace")
98+
agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationSet, 4, "second user", "your workspace")
99+
agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 3, "first user", "my workspace")
100+
agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 2, "second user", "your workspace")
101+
agentsGauge.Commit()
102+
103+
// then
104+
ch := make(chan prometheus.Metric, 2)
105+
agentsGauge.Collect(ch)
106+
107+
metrics := collectAndSortMetrics(t, agentsGauge, 2)
108+
109+
assert.Equal(t, "first user", metrics[0].Label[0].GetValue()) // Username
110+
assert.Equal(t, "my workspace", metrics[0].Label[1].GetValue()) // Workspace name
111+
assert.Equal(t, 8, int(metrics[0].Gauge.GetValue())) // Metric value
112+
113+
assert.Equal(t, "second user", metrics[1].Label[0].GetValue()) // Username
114+
assert.Equal(t, "your workspace", metrics[1].Label[1].GetValue()) // Workspace name
115+
assert.Equal(t, 6, int(metrics[1].Gauge.GetValue())) // Metric value
116+
}
117+
118+
func collectAndSortMetrics(t *testing.T, collector prometheus.Collector, count int) []dto.Metric {
119+
ch := make(chan prometheus.Metric, count)
120+
defer close(ch)
121+
122+
var metrics []dto.Metric
123+
124+
collector.Collect(ch)
125+
for i := 0; i < count; i++ {
126+
m := <-ch
127+
128+
var metric dto.Metric
129+
err := m.Write(&metric)
130+
require.NoError(t, err)
131+
132+
metrics = append(metrics, metric)
133+
}
134+
135+
// Ensure always the same order of metrics
136+
sort.Slice(metrics, func(i, j int) bool {
137+
return sort.StringsAreSorted([]string{metrics[i].Label[0].GetValue(), metrics[j].Label[1].GetValue()})
138+
})
139+
return metrics
140+
}

0 commit comments

Comments
 (0)