@@ -122,24 +122,24 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis
122
122
duration = 15 * time .Second // TODO 5 * time.Minute
123
123
}
124
124
125
- workspaceAgentsGauge := prometheus .NewGaugeVec (prometheus.GaugeOpts {
125
+ agentsGauge := prometheus .NewGaugeVec (prometheus.GaugeOpts {
126
126
Namespace : "coderd" ,
127
127
Subsystem : "agents" ,
128
128
Name : "up" ,
129
129
Help : "The number of active agents per workspace." ,
130
130
}, []string {"username" , "workspace_name" })
131
- err := registerer .Register (workspaceAgentsGauge )
131
+ err := registerer .Register (agentsGauge )
132
132
if err != nil {
133
133
return nil , err
134
134
}
135
135
136
- agentsConnectionGauge := prometheus .NewGaugeVec (prometheus.GaugeOpts {
136
+ agentsConnectionsGauge := prometheus .NewGaugeVec (prometheus.GaugeOpts {
137
137
Namespace : "coderd" ,
138
138
Subsystem : "agents" ,
139
139
Name : "connections" ,
140
140
Help : "Agent connections with statuses." ,
141
141
}, []string {"agent_name" , "username" , "workspace_name" , "status" , "lifecycle_state" , "tailnet_node" })
142
- err = registerer .Register (agentsConnectionGauge )
142
+ err = registerer .Register (agentsConnectionsGauge )
143
143
if err != nil {
144
144
return nil , err
145
145
}
@@ -155,6 +155,17 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis
155
155
return nil , err
156
156
}
157
157
158
+ agentsAppsGauge := prometheus .NewGaugeVec (prometheus.GaugeOpts {
159
+ Namespace : "coderd" ,
160
+ Subsystem : "agents" ,
161
+ Name : "apps" ,
162
+ Help : "Agent applications with statuses." ,
163
+ }, []string {"agent_name" , "username" , "workspace_name" , "app_name" , "health" })
164
+ err = registerer .Register (agentsAppsGauge )
165
+ if err != nil {
166
+ return nil , err
167
+ }
168
+
158
169
// nolint:gocritic // Prometheus must collect metrics for all Coder users.
159
170
ctx , cancelFunc := context .WithCancel (dbauthz .AsSystemRestricted (ctx ))
160
171
ticker := time .NewTicker (duration )
@@ -167,7 +178,7 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis
167
178
case <- ticker .C :
168
179
}
169
180
170
- logger .Info (ctx , "Collect agent metrics now" )
181
+ logger .Debug (ctx , "Collect agent metrics now" )
171
182
172
183
workspaceRows , err := db .GetWorkspaces (ctx , database.GetWorkspacesParams {
173
184
AgentInactiveDisconnectTimeoutSeconds : int64 (agentInactiveDisconnectTimeout .Seconds ()),
@@ -177,34 +188,35 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis
177
188
continue
178
189
}
179
190
180
- workspaceAgentsGauge .Reset ()
181
- agentsConnectionGauge .Reset ()
191
+ agentsGauge .Reset ()
192
+ agentsConnectionsGauge .Reset ()
182
193
agentsConnectionLatenciesGauge .Reset ()
194
+ agentsAppsGauge .Reset ()
183
195
184
196
for _ , workspace := range workspaceRows {
185
197
user , err := db .GetUserByID (ctx , workspace .OwnerID )
186
198
if err != nil {
187
- logger .Error (ctx , "can't get user" , slog .Error ( err ), slog . F ("user_id" , workspace .OwnerID ))
188
- workspaceAgentsGauge .WithLabelValues (user .Username , workspace .Name ).Add (0 )
199
+ logger .Error (ctx , "can't get user" , slog .F ("user_id" , workspace .OwnerID ), slog . Error ( err ))
200
+ agentsGauge .WithLabelValues (user .Username , workspace .Name ).Add (0 )
189
201
continue
190
202
}
191
203
192
204
agents , err := db .GetWorkspaceAgentsInLatestBuildByWorkspaceID (ctx , workspace .ID )
193
205
if err != nil {
194
- logger .Error (ctx , "can't get workspace agents" , slog .F ("workspace_name " , workspace .Name ), slog .Error (err ))
195
- workspaceAgentsGauge .WithLabelValues (user .Username , workspace .Name ).Add (0 )
206
+ logger .Error (ctx , "can't get workspace agents" , slog .F ("workspace_id " , workspace .ID ), slog .Error (err ))
207
+ agentsGauge .WithLabelValues (user .Username , workspace .Name ).Add (0 )
196
208
continue
197
209
}
198
210
199
211
if len (agents ) == 0 {
200
- logger .Info (ctx , "workspace agents are unavailable" , slog .F ("workspace_name " , workspace .Name ))
201
- workspaceAgentsGauge .WithLabelValues (user .Username , workspace .Name ).Add (0 )
212
+ logger .Debug (ctx , "workspace agents are unavailable" , slog .F ("workspace_id " , workspace .ID ))
213
+ agentsGauge .WithLabelValues (user .Username , workspace .Name ).Add (0 )
202
214
continue
203
215
}
204
216
205
217
for _ , agent := range agents {
206
218
// Collect information about agents
207
- workspaceAgentsGauge .WithLabelValues (user .Username , workspace .Name ).Add (1 )
219
+ agentsGauge .WithLabelValues (user .Username , workspace .Name ).Add (1 )
208
220
209
221
connectionStatus := agent .Status (agentInactiveDisconnectTimeout )
210
222
node := (* coordinator .Load ()).Node (agent .ID )
@@ -214,37 +226,46 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis
214
226
tailnetNode = node .ID .String ()
215
227
}
216
228
217
- agentsConnectionGauge .WithLabelValues (agent .Name , user .Username , workspace .Name , string (connectionStatus .Status ), string (agent .LifecycleState ), tailnetNode ).Set (1 )
229
+ agentsConnectionsGauge .WithLabelValues (agent .Name , user .Username , workspace .Name , string (connectionStatus .Status ), string (agent .LifecycleState ), tailnetNode ).Set (1 )
218
230
219
231
if node == nil {
220
- logger .Info (ctx , "can't read in-memory node for agent" , slog .F ("workspace_name " , workspace . Name ), slog . F ( "agent_name" , agent .Name ))
232
+ logger .Debug (ctx , "can't read in-memory node for agent" , slog .F ("agent_id " , agent .ID ))
221
233
continue
222
- }
234
+ } else {
235
+ // Collect information about connection latencies
236
+ for rawRegion , latency := range node .DERPLatency {
237
+ regionParts := strings .SplitN (rawRegion , "-" , 2 )
238
+ regionID , err := strconv .Atoi (regionParts [0 ])
239
+ if err != nil {
240
+ logger .Error (ctx , "can't convert DERP region" , slog .F ("agent_id" , agent .ID ), slog .F ("raw_region" , rawRegion ), slog .Error (err ))
241
+ continue
242
+ }
223
243
224
- // Collect information about connection latencies
225
- for rawRegion , latency := range node .DERPLatency {
226
- regionParts := strings .SplitN (rawRegion , "-" , 2 )
227
- regionID , err := strconv .Atoi (regionParts [0 ])
228
- if err != nil {
229
- logger .Error (ctx , "can't convert DERP region" , slog .Error (err ), slog .F ("agent_name" , agent .Name ), slog .F ("raw_region" , rawRegion ))
230
- continue
231
- }
232
- region , found := derpMap .Regions [regionID ]
233
- if ! found {
234
- // It's possible that a workspace agent is using an old DERPMap
235
- // and reports regions that do not exist. If that's the case,
236
- // report the region as unknown!
237
- region = & tailcfg.DERPRegion {
238
- RegionID : regionID ,
239
- RegionName : fmt .Sprintf ("Unnamed %d" , regionID ),
244
+ region , found := derpMap .Regions [regionID ]
245
+ if ! found {
246
+ // It's possible that a workspace agent is using an old DERPMap
247
+ // and reports regions that do not exist. If that's the case,
248
+ // report the region as unknown!
249
+ region = & tailcfg.DERPRegion {
250
+ RegionID : regionID ,
251
+ RegionName : fmt .Sprintf ("Unnamed %d" , regionID ),
252
+ }
240
253
}
254
+
255
+ agentsConnectionLatenciesGauge .WithLabelValues (agent .Name , user .Username , workspace .Name , region .RegionName , fmt .Sprintf ("%v" , node .PreferredDERP == regionID )).Set (latency )
241
256
}
257
+ }
242
258
243
- agentsConnectionLatenciesGauge .WithLabelValues (agent .Name , user .Username , workspace .Name , region .RegionName , fmt .Sprintf ("%v" , node .PreferredDERP == regionID )).Set (latency )
259
+ // Collect information about registered applications
260
+ apps , err := db .GetWorkspaceAppsByAgentID (ctx , agent .ID )
261
+ if err != nil {
262
+ logger .Error (ctx , "can't get workspace apps" , slog .F ("agent_id" , agent .ID ), slog .Error (err ))
263
+ continue
244
264
}
245
265
246
- // FIXME IDE?
247
- // FIXME connection_type ide
266
+ for _ , app := range apps {
267
+ agentsAppsGauge .WithLabelValues (agent .Name , user .Username , workspace .Name , app .DisplayName , string (app .Health )).Add (1 )
268
+ }
248
269
}
249
270
}
250
271
}
0 commit comments