@@ -24,6 +24,7 @@ import (
24
24
25
25
"github.com/armon/circbuf"
26
26
"github.com/google/uuid"
27
+ "github.com/prometheus/client_golang/prometheus"
27
28
"github.com/spf13/afero"
28
29
"go.uber.org/atomic"
29
30
"golang.org/x/exp/slices"
@@ -63,6 +64,8 @@ type Options struct {
63
64
SSHMaxTimeout time.Duration
64
65
TailnetListenPort uint16
65
66
Subsystem codersdk.AgentSubsystem
67
+
68
+ PrometheusRegistry * prometheus.Registry
66
69
}
67
70
68
71
type Client interface {
@@ -102,6 +105,12 @@ func New(options Options) Agent {
102
105
return "" , nil
103
106
}
104
107
}
108
+
109
+ prometheusRegistry := options .PrometheusRegistry
110
+ if prometheusRegistry == nil {
111
+ prometheusRegistry = prometheus .NewRegistry ()
112
+ }
113
+
105
114
ctx , cancelFunc := context .WithCancel (context .Background ())
106
115
a := & agent {
107
116
tailnetListenPort : options .TailnetListenPort ,
@@ -121,6 +130,9 @@ func New(options Options) Agent {
121
130
connStatsChan : make (chan * agentsdk.Stats , 1 ),
122
131
sshMaxTimeout : options .SSHMaxTimeout ,
123
132
subsystem : options .Subsystem ,
133
+
134
+ prometheusRegistry : prometheusRegistry ,
135
+ metrics : newAgentMetrics (prometheusRegistry ),
124
136
}
125
137
a .init (ctx )
126
138
return a
@@ -165,10 +177,13 @@ type agent struct {
165
177
latestStat atomic.Pointer [agentsdk.Stats ]
166
178
167
179
connCountReconnectingPTY atomic.Int64
180
+
181
+ prometheusRegistry * prometheus.Registry
182
+ metrics * agentMetrics
168
183
}
169
184
170
185
func (a * agent ) init (ctx context.Context ) {
171
- sshSrv , err := agentssh .NewServer (ctx , a .logger .Named ("ssh-server" ), a .filesystem , a .sshMaxTimeout , "" )
186
+ sshSrv , err := agentssh .NewServer (ctx , a .logger .Named ("ssh-server" ), a .prometheusRegistry , a . filesystem , a .sshMaxTimeout , "" )
172
187
if err != nil {
173
188
panic (err )
174
189
}
@@ -983,6 +998,7 @@ func (a *agent) trackScriptLogs(ctx context.Context, reader io.Reader) (chan str
983
998
984
999
func (a * agent ) handleReconnectingPTY (ctx context.Context , logger slog.Logger , msg codersdk.WorkspaceAgentReconnectingPTYInit , conn net.Conn ) (retErr error ) {
985
1000
defer conn .Close ()
1001
+ a .metrics .connectionsTotal .Add (1 )
986
1002
987
1003
a .connCountReconnectingPTY .Add (1 )
988
1004
defer a .connCountReconnectingPTY .Add (- 1 )
@@ -1022,6 +1038,7 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m
1022
1038
// Empty command will default to the user's shell!
1023
1039
cmd , err := a .sshServer .CreateCommand (ctx , msg .Command , nil )
1024
1040
if err != nil {
1041
+ a .metrics .reconnectingPTYErrors .WithLabelValues ("create_command" ).Add (1 )
1025
1042
return xerrors .Errorf ("create command: %w" , err )
1026
1043
}
1027
1044
cmd .Env = append (cmd .Env , "TERM=xterm-256color" )
@@ -1034,6 +1051,7 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m
1034
1051
1035
1052
ptty , process , err := pty .Start (cmd )
1036
1053
if err != nil {
1054
+ a .metrics .reconnectingPTYErrors .WithLabelValues ("start_command" ).Add (1 )
1037
1055
return xerrors .Errorf ("start command: %w" , err )
1038
1056
}
1039
1057
@@ -1060,7 +1078,12 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m
1060
1078
if err != nil {
1061
1079
// When the PTY is closed, this is triggered.
1062
1080
// Error is typically a benign EOF, which is only logged for debugging; any other error is logged as a warning and counted.
1063
- logger .Debug (ctx , "unable to read pty output, command exited?" , slog .Error (err ))
1081
+ if errors .Is (err , io .EOF ) {
1082
+ logger .Debug (ctx , "unable to read pty output, command exited?" , slog .Error (err ))
1083
+ } else {
1084
+ logger .Warn (ctx , "unable to read pty output, command exited?" , slog .Error (err ))
1085
+ a .metrics .reconnectingPTYErrors .WithLabelValues ("output_reader" ).Add (1 )
1086
+ }
1064
1087
break
1065
1088
}
1066
1089
part := buffer [:read ]
@@ -1075,11 +1098,12 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m
1075
1098
for cid , conn := range rpty .activeConns {
1076
1099
_ , err = conn .Write (part )
1077
1100
if err != nil {
1078
- logger .Debug (ctx ,
1101
+ logger .Warn (ctx ,
1079
1102
"error writing to active conn" ,
1080
1103
slog .F ("other_conn_id" , cid ),
1081
1104
slog .Error (err ),
1082
1105
)
1106
+ a .metrics .reconnectingPTYErrors .WithLabelValues ("write" ).Add (1 )
1083
1107
}
1084
1108
}
1085
1109
rpty .activeConnsMutex .Unlock ()
@@ -1099,6 +1123,7 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m
1099
1123
if err != nil {
1100
1124
// We can continue after this, it's not fatal!
1101
1125
logger .Error (ctx , "resize" , slog .Error (err ))
1126
+ a .metrics .reconnectingPTYErrors .WithLabelValues ("resize" ).Add (1 )
1102
1127
}
1103
1128
// Write any previously stored data for the TTY.
1104
1129
rpty .circularBufferMutex .RLock ()
@@ -1111,6 +1136,7 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m
1111
1136
// while also holding circularBufferMutex seems dangerous.
1112
1137
_ , err = conn .Write (prevBuf )
1113
1138
if err != nil {
1139
+ a .metrics .reconnectingPTYErrors .WithLabelValues ("write" ).Add (1 )
1114
1140
return xerrors .Errorf ("write buffer to conn: %w" , err )
1115
1141
}
1116
1142
// Multiple connections to the same TTY are permitted.
@@ -1161,6 +1187,7 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m
1161
1187
_ , err = rpty .ptty .InputWriter ().Write ([]byte (req .Data ))
1162
1188
if err != nil {
1163
1189
logger .Warn (ctx , "write to pty" , slog .Error (err ))
1190
+ a .metrics .reconnectingPTYErrors .WithLabelValues ("input_writer" ).Add (1 )
1164
1191
return nil
1165
1192
}
1166
1193
// Check if a resize needs to happen!
@@ -1171,6 +1198,7 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m
1171
1198
if err != nil {
1172
1199
// We can continue after this, it's not fatal!
1173
1200
logger .Error (ctx , "resize" , slog .Error (err ))
1201
+ a .metrics .reconnectingPTYErrors .WithLabelValues ("resize" ).Add (1 )
1174
1202
}
1175
1203
}
1176
1204
}
@@ -1203,7 +1231,7 @@ func (a *agent) startReportingConnectionStats(ctx context.Context) {
1203
1231
var mu sync.Mutex
1204
1232
status := a .network .Status ()
1205
1233
durations := []float64 {}
1206
- ctx , cancelFunc := context .WithTimeout (ctx , 5 * time .Second )
1234
+ pingCtx , cancelFunc := context .WithTimeout (ctx , 5 * time .Second )
1207
1235
defer cancelFunc ()
1208
1236
for nodeID , peer := range status .Peer {
1209
1237
if ! peer .Active {
@@ -1219,7 +1247,7 @@ func (a *agent) startReportingConnectionStats(ctx context.Context) {
1219
1247
wg .Add (1 )
1220
1248
go func () {
1221
1249
defer wg .Done ()
1222
- duration , _ , _ , err := a .network .Ping (ctx , addresses [0 ].Addr ())
1250
+ duration , _ , _ , err := a .network .Ping (pingCtx , addresses [0 ].Addr ())
1223
1251
if err != nil {
1224
1252
return
1225
1253
}
@@ -1244,7 +1272,10 @@ func (a *agent) startReportingConnectionStats(ctx context.Context) {
1244
1272
// Collect agent metrics.
1245
1273
// Agent metrics are changing all the time, so there is no need to perform
1246
1274
// reflect.DeepEqual to see if stats should be transferred.
1247
- stats .Metrics = collectMetrics ()
1275
+
1276
+ metricsCtx , cancelFunc := context .WithTimeout (ctx , 5 * time .Second )
1277
+ defer cancelFunc ()
1278
+ stats .Metrics = a .collectMetrics (metricsCtx )
1248
1279
1249
1280
a .latestStat .Store (stats )
1250
1281
0 commit comments