Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

feat: expose agent stats via Prometheus endpoint #7115

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 43 commits into from
Apr 14, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
43 commits
Select commit (hold Shift + click to select a range)
8d4e67d
WIP
mtojek Apr 3, 2023
da729e6
Merge branch 'main' into 6724-metrics
mtojek Apr 4, 2023
9ad09b2
WIP
mtojek Apr 4, 2023
440657c
WIP
mtojek Apr 5, 2023
8764f89
Agents
mtojek Apr 5, 2023
663b5d5
fix
mtojek Apr 5, 2023
63aff5e
1min
mtojek Apr 5, 2023
3905481
fix
mtojek Apr 5, 2023
f8d6f46
WIP
mtojek Apr 5, 2023
d487a77
Test
mtojek Apr 5, 2023
7acbaf0
docs
mtojek Apr 5, 2023
7418779
fmt
mtojek Apr 5, 2023
3a8e4e6
Add timer to measure the metrics collection
mtojek Apr 6, 2023
b5d0581
Use CachedGaugeVec
mtojek Apr 6, 2023
e4d708b
Unit tests
mtojek Apr 6, 2023
199e549
WIP
mtojek Apr 7, 2023
7307bd3
Merge branch 'main' into 6724-metrics-2
mtojek Apr 12, 2023
d0b8398
WIP
mtojek Apr 13, 2023
f0c0418
db: GetWorkspaceAgentStatsAndLabels
mtojek Apr 13, 2023
970d35a
fmt
mtojek Apr 13, 2023
229f546
WIP
mtojek Apr 13, 2023
7070e0e
Merge branch 'main' into 6724-metrics-2
mtojek Apr 13, 2023
8c6f96b
gauges
mtojek Apr 13, 2023
1ed37b4
feat: collect
mtojek Apr 13, 2023
7ee1bfc
fix
mtojek Apr 13, 2023
2b8a9e4
fmt
mtojek Apr 13, 2023
322f7e8
minor fixes
mtojek Apr 14, 2023
c7af75a
Prometheus flag
mtojek Apr 14, 2023
9693fa8
fix
mtojek Apr 14, 2023
28f7a13
WIP
mtojek Apr 14, 2023
7878167
fix tests
mtojek Apr 14, 2023
d9e4903
WIP
mtojek Apr 14, 2023
0d37c85
fix json
mtojek Apr 14, 2023
f752c6f
Rx Tx bytes
mtojek Apr 14, 2023
9c7aef8
CloseFunc
mtojek Apr 14, 2023
5290571
fix
mtojek Apr 14, 2023
1cbe59b
fix
mtojek Apr 14, 2023
f8f11eb
Fixes
mtojek Apr 14, 2023
4ffae11
fix
mtojek Apr 14, 2023
7ba16b5
fix: IgnoreErrors
mtojek Apr 14, 2023
2a4c674
Fix: Windows
mtojek Apr 14, 2023
201da83
fix
mtojek Apr 14, 2023
ba52c45
reflect.DeepEquals
mtojek Apr 14, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
WIP
  • Loading branch information
mtojek committed Apr 14, 2023
commit 28f7a13216d75afec3a5fc723ddb6fd62b009d40
50 changes: 23 additions & 27 deletions coderd/prometheusmetrics/prometheusmetrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -417,38 +417,34 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R
stats, err := db.GetWorkspaceAgentStatsAndLabels(ctx, createdAfter)
if err != nil {
logger.Error(ctx, "can't get agent stats", slog.Error(err))
goto done
}

if len(stats) == 0 {
goto done
}
} else {
for _, agentStat := range stats {
agentStatsRxBytesGauge.WithLabelValues(VectorOperationAdd, float64(agentStat.WorkspaceRxBytes), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName)
agentStatsTxBytesGauge.WithLabelValues(VectorOperationAdd, float64(agentStat.WorkspaceTxBytes), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName)

agentStatsConnectionCountGauge.WithLabelValues(VectorOperationSet, float64(agentStat.ConnectionCount), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName)
agentStatsConnectionMedianLatencyGauge.WithLabelValues(VectorOperationSet, agentStat.ConnectionMedianLatencyMS/1000.0 /* (to seconds) */, agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName)

agentStatsSessionCountJetBrainsGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountJetBrains), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName)
agentStatsSessionCountReconnectingPTYGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountReconnectingPTY), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName)
agentStatsSessionCountSSHGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountSSH), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName)
agentStatsSessionCountVSCodeGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountVSCode), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName)
}

for _, agentStat := range stats {
agentStatsRxBytesGauge.WithLabelValues(VectorOperationAdd, float64(agentStat.WorkspaceTxBytes), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName)
agentStatsTxBytesGauge.WithLabelValues(VectorOperationAdd, float64(agentStat.WorkspaceRxBytes), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName)
if len(stats) > 0 {
agentStatsRxBytesGauge.Commit()
agentStatsTxBytesGauge.Commit()

agentStatsConnectionCountGauge.WithLabelValues(VectorOperationSet, float64(agentStat.ConnectionCount), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName)
agentStatsConnectionMedianLatencyGauge.WithLabelValues(VectorOperationSet, agentStat.ConnectionMedianLatencyMS/1000.0 /* (to seconds) */, agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName)
agentStatsConnectionCountGauge.Commit()
agentStatsConnectionMedianLatencyGauge.Commit()

agentStatsSessionCountJetBrainsGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountJetBrains), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName)
agentStatsSessionCountReconnectingPTYGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountReconnectingPTY), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName)
agentStatsSessionCountSSHGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountSSH), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName)
agentStatsSessionCountVSCodeGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountVSCode), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName)
agentStatsSessionCountJetBrainsGauge.Commit()
agentStatsSessionCountReconnectingPTYGauge.Commit()
agentStatsSessionCountSSHGauge.Commit()
agentStatsSessionCountVSCodeGauge.Commit()
}
}

agentStatsRxBytesGauge.Commit()
agentStatsTxBytesGauge.Commit()

agentStatsConnectionCountGauge.Commit()
agentStatsConnectionMedianLatencyGauge.Commit()

agentStatsSessionCountJetBrainsGauge.Commit()
agentStatsSessionCountReconnectingPTYGauge.Commit()
agentStatsSessionCountSSHGauge.Commit()
agentStatsSessionCountVSCodeGauge.Commit()

done:
logger.Debug(ctx, "Agent metrics collection is done")
metricsCollectorAgentStats.Observe(timer.ObserveDuration().Seconds())

Expand Down
104 changes: 69 additions & 35 deletions coderd/prometheusmetrics/prometheusmetrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@ package prometheusmetrics_test
import (
"context"
"database/sql"
"encoding/json"
"fmt"
"os"
"sync/atomic"
"testing"
"time"
Expand Down Expand Up @@ -357,24 +360,15 @@ func TestAgents(t *testing.T) {
func TestAgentStats(t *testing.T) {
t.Parallel()

// Build a sample workspace with test agent and fake agent client
// Build sample workspaces with test agents and fake agent client
client, _, api := coderdtest.NewWithAPI(t, &coderdtest.Options{IncludeProvisionerDaemon: true})
db := api.Database

user := coderdtest.CreateFirstUser(t, client)
authToken := uuid.NewString()
version := coderdtest.CreateTemplateVersion(t, client, user.OrganizationID, &echo.Responses{
Parse: echo.ParseComplete,
ProvisionPlan: echo.ProvisionComplete,
ProvisionApply: echo.ProvisionApplyWithAgent(authToken),
})
template := coderdtest.CreateTemplate(t, client, user.OrganizationID, version.ID)
coderdtest.AwaitTemplateVersionJob(t, client, version.ID)
workspace := coderdtest.CreateWorkspace(t, client, user.OrganizationID, template.ID)
coderdtest.AwaitWorkspaceBuildJob(t, client, workspace.LatestBuild.ID)

agentClient := agentsdk.New(client.URL)
agentClient.SetSessionToken(authToken)
agent1, _ := prepareWorkspaceAndAgent(t, client, user, 1)
agent2, _ := prepareWorkspaceAndAgent(t, client, user, 2)
agent3, _ := prepareWorkspaceAndAgent(t, client, user, 3)

registry := prometheus.NewRegistry()

Expand All @@ -384,24 +378,45 @@ func TestAgentStats(t *testing.T) {
t.Cleanup(cancel)

// when
_, err = agentClient.PostStats(context.Background(), &agentsdk.Stats{
ConnectionsByProto: map[string]int64{"TCP": 1},
ConnectionCount: 2,
RxPackets: 3,
RxBytes: 4,
TxPackets: 5,
TxBytes: 6,
SessionCountVSCode: 7,
SessionCountJetBrains: 8,
SessionCountReconnectingPTY: 9,
SessionCountSSH: 10,
ConnectionMedianLatencyMS: 10000,
})
var i int64
for i = 0; i < 3; i++ {
_, err = agent1.PostStats(context.Background(), &agentsdk.Stats{
TxBytes: 1 + i, RxBytes: 2 + i,
SessionCountVSCode: 3 + i, SessionCountJetBrains: 4 + i, SessionCountReconnectingPTY: 5 + i, SessionCountSSH: 6 + i,
ConnectionCount: 7 + i, ConnectionMedianLatencyMS: 8000,
ConnectionsByProto: map[string]int64{"TCP": 1},
})
require.NoError(t, err)

_, err = agent2.PostStats(context.Background(), &agentsdk.Stats{
TxBytes: 2 + i, RxBytes: 4 + i,
SessionCountVSCode: 6 + i, SessionCountJetBrains: 8 + i, SessionCountReconnectingPTY: 10 + i, SessionCountSSH: 12 + i,
ConnectionCount: 8 + i, ConnectionMedianLatencyMS: 10000,
ConnectionsByProto: map[string]int64{"TCP": 1},
})
require.NoError(t, err)

_, err = agent3.PostStats(context.Background(), &agentsdk.Stats{
TxBytes: 3 + i, RxBytes: 6 + i,
SessionCountVSCode: 12 + i, SessionCountJetBrains: 14 + i, SessionCountReconnectingPTY: 16 + i, SessionCountSSH: 18 + i,
ConnectionCount: 9 + i, ConnectionMedianLatencyMS: 12000,
ConnectionsByProto: map[string]int64{"TCP": 1},
})
require.NoError(t, err)
}

// then
goldenFile, err := os.ReadFile("testdata/agent-stats.json")
require.NoError(t, err)
areMetricsValid := func(collected map[string]int) bool {
out, err := json.MarshalIndent(collected, " ", " ")
require.NoError(t, err)
os.WriteFile("testdata/agent-stats.json", out, 0644)
return string(goldenFile) == string(out)
}

collectedMetrics := map[string]struct{}{}
collected := map[string]int{}
var executionSeconds bool
require.Eventually(t, func() bool {
metrics, err := registry.Gather()
assert.NoError(t, err)
Expand All @@ -413,7 +428,7 @@ func TestAgentStats(t *testing.T) {
for _, metric := range metrics {
switch metric.GetName() {
case "coderd_prometheusmetrics_agentstats_execution_seconds":
collectedMetrics[metric.GetName()] = struct{}{}
executionSeconds = true
case "coderd_agentstats_connection_count",
"coderd_agentstats_connection_median_latency_seconds",
"coderd_agentstats_rx_bytes",
Expand All @@ -422,16 +437,35 @@ func TestAgentStats(t *testing.T) {
"coderd_agentstats_session_count_reconnecting_pty",
"coderd_agentstats_session_count_ssh",
"coderd_agentstats_session_count_vscode":
collectedMetrics[metric.GetName()] = struct{}{}
assert.Equal(t, "example", metric.Metric[0].Label[0].GetValue()) // Agent name
assert.Equal(t, "testuser", metric.Metric[0].Label[1].GetValue()) // Username
assert.Equal(t, workspace.Name, metric.Metric[0].Label[2].GetValue()) // Workspace name
assert.NotZero(t, int(metric.Metric[0].Gauge.GetValue()), metric.GetName()) // Metric value
for _, m := range metric.Metric {
// username:workspace:agent:metric = value
collected[m.Label[1].GetValue()+":"+m.Label[2].GetValue()+":"+m.Label[0].GetValue()+":"+metric.GetName()] = int(m.Gauge.GetValue())
}
default:
require.FailNowf(t, "unexpected metric collected", "metric: %s", metric.GetName())
}
}
return executionSeconds && areMetricsValid(collected)
}, testutil.WaitLong, testutil.IntervalMedium)
}

func prepareWorkspaceAndAgent(t *testing.T, client *codersdk.Client, user codersdk.CreateFirstUserResponse, workspaceNum int) (*agentsdk.Client, codersdk.Workspace) {
authToken := uuid.NewString()

version := coderdtest.CreateTemplateVersion(t, client, user.OrganizationID, &echo.Responses{
Parse: echo.ParseComplete,
ProvisionPlan: echo.ProvisionComplete,
ProvisionApply: echo.ProvisionApplyWithAgent(authToken),
})
template := coderdtest.CreateTemplate(t, client, user.OrganizationID, version.ID)
coderdtest.AwaitTemplateVersionJob(t, client, version.ID)
workspace := coderdtest.CreateWorkspace(t, client, user.OrganizationID, template.ID, func(cwr *codersdk.CreateWorkspaceRequest) {
cwr.Name = fmt.Sprintf("workspace-%d", workspaceNum)
})
coderdtest.AwaitWorkspaceBuildJob(t, client, workspace.LatestBuild.ID)

agentClient := agentsdk.New(client.URL)
agentClient.SetSessionToken(authToken)

return len(collectedMetrics) == 9
}, testutil.WaitShort, testutil.IntervalFast, "collected metrics: %v", collectedMetrics)
return agentClient, workspace
}
26 changes: 26 additions & 0 deletions coderd/prometheusmetrics/testdata/agent-stats.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
{
"testuser:workspace-1:example:coderd_agentstats_connection_count": 9,
"testuser:workspace-1:example:coderd_agentstats_connection_median_latency_seconds": 8,
"testuser:workspace-1:example:coderd_agentstats_rx_bytes": 9,
"testuser:workspace-1:example:coderd_agentstats_session_count_jetbrains": 6,
"testuser:workspace-1:example:coderd_agentstats_session_count_reconnecting_pty": 7,
"testuser:workspace-1:example:coderd_agentstats_session_count_ssh": 8,
"testuser:workspace-1:example:coderd_agentstats_session_count_vscode": 5,
"testuser:workspace-1:example:coderd_agentstats_tx_bytes": 6,
"testuser:workspace-2:example:coderd_agentstats_connection_count": 10,
"testuser:workspace-2:example:coderd_agentstats_connection_median_latency_seconds": 10,
"testuser:workspace-2:example:coderd_agentstats_rx_bytes": 15,
"testuser:workspace-2:example:coderd_agentstats_session_count_jetbrains": 10,
"testuser:workspace-2:example:coderd_agentstats_session_count_reconnecting_pty": 12,
"testuser:workspace-2:example:coderd_agentstats_session_count_ssh": 14,
"testuser:workspace-2:example:coderd_agentstats_session_count_vscode": 8,
"testuser:workspace-2:example:coderd_agentstats_tx_bytes": 9,
"testuser:workspace-3:example:coderd_agentstats_connection_count": 11,
"testuser:workspace-3:example:coderd_agentstats_connection_median_latency_seconds": 12,
"testuser:workspace-3:example:coderd_agentstats_rx_bytes": 21,
"testuser:workspace-3:example:coderd_agentstats_session_count_jetbrains": 16,
"testuser:workspace-3:example:coderd_agentstats_session_count_reconnecting_pty": 18,
"testuser:workspace-3:example:coderd_agentstats_session_count_ssh": 20,
"testuser:workspace-3:example:coderd_agentstats_session_count_vscode": 14,
"testuser:workspace-3:example:coderd_agentstats_tx_bytes": 12
}