@@ -24,6 +24,7 @@ import (
2424
2525 "github.com/armon/circbuf"
2626 "github.com/google/uuid"
27+ "github.com/prometheus/client_golang/prometheus"
2728 "github.com/spf13/afero"
2829 "go.uber.org/atomic"
2930 "golang.org/x/exp/slices"
@@ -63,6 +64,8 @@ type Options struct {
6364 SSHMaxTimeout time.Duration
6465 TailnetListenPort uint16
6566 Subsystem codersdk.AgentSubsystem
67+
68+ PrometheusRegistry * prometheus.Registry
6669}
6770
6871type Client interface {
@@ -102,6 +105,12 @@ func New(options Options) Agent {
102105 return "" , nil
103106 }
104107 }
108+
109+ prometheusRegistry := options .PrometheusRegistry
110+ if prometheusRegistry == nil {
111+ prometheusRegistry = prometheus .NewRegistry ()
112+ }
113+
105114 ctx , cancelFunc := context .WithCancel (context .Background ())
106115 a := & agent {
107116 tailnetListenPort : options .TailnetListenPort ,
@@ -121,6 +130,9 @@ func New(options Options) Agent {
121130 connStatsChan : make (chan * agentsdk.Stats , 1 ),
122131 sshMaxTimeout : options .SSHMaxTimeout ,
123132 subsystem : options .Subsystem ,
133+
134+ prometheusRegistry : prometheusRegistry ,
135+ metrics : newAgentMetrics (prometheusRegistry ),
124136 }
125137 a .init (ctx )
126138 return a
@@ -165,10 +177,13 @@ type agent struct {
165177 latestStat atomic.Pointer [agentsdk.Stats ]
166178
167179 connCountReconnectingPTY atomic.Int64
180+
181+ prometheusRegistry * prometheus.Registry
182+ metrics * agentMetrics
168183}
169184
170185func (a * agent ) init (ctx context.Context ) {
171- sshSrv , err := agentssh .NewServer (ctx , a .logger .Named ("ssh-server" ), a .filesystem , a .sshMaxTimeout , "" )
186+ sshSrv , err := agentssh .NewServer (ctx , a .logger .Named ("ssh-server" ), a .prometheusRegistry , a . filesystem , a .sshMaxTimeout , "" )
172187 if err != nil {
173188 panic (err )
174189 }
@@ -983,6 +998,7 @@ func (a *agent) trackScriptLogs(ctx context.Context, reader io.Reader) (chan str
983998
984999func (a * agent ) handleReconnectingPTY (ctx context.Context , logger slog.Logger , msg codersdk.WorkspaceAgentReconnectingPTYInit , conn net.Conn ) (retErr error ) {
9851000 defer conn .Close ()
1001+ a .metrics .connectionsTotal .Add (1 )
9861002
9871003 a .connCountReconnectingPTY .Add (1 )
9881004 defer a .connCountReconnectingPTY .Add (- 1 )
@@ -1022,6 +1038,7 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m
10221038 // Empty command will default to the users shell!
10231039 cmd , err := a .sshServer .CreateCommand (ctx , msg .Command , nil )
10241040 if err != nil {
1041+ a .metrics .reconnectingPTYErrors .WithLabelValues ("create_command" ).Add (1 )
10251042 return xerrors .Errorf ("create command: %w" , err )
10261043 }
10271044 cmd .Env = append (cmd .Env , "TERM=xterm-256color" )
@@ -1034,6 +1051,7 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m
10341051
10351052 ptty , process , err := pty .Start (cmd )
10361053 if err != nil {
1054+ a .metrics .reconnectingPTYErrors .WithLabelValues ("start_command" ).Add (1 )
10371055 return xerrors .Errorf ("start command: %w" , err )
10381056 }
10391057
@@ -1060,7 +1078,12 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m
10601078 if err != nil {
10611079 // When the PTY is closed, this is triggered.
10621080 // Error is typically a benign EOF, which is only logged for debugging; any other error is logged as a warning and counted as an "output_reader" error.
1063- logger .Debug (ctx , "unable to read pty output, command exited?" , slog .Error (err ))
1081+ if errors .Is (err , io .EOF ) {
1082+ logger .Debug (ctx , "unable to read pty output, command exited?" , slog .Error (err ))
1083+ } else {
1084+ logger .Warn (ctx , "unable to read pty output, command exited?" , slog .Error (err ))
1085+ a .metrics .reconnectingPTYErrors .WithLabelValues ("output_reader" ).Add (1 )
1086+ }
10641087 break
10651088 }
10661089 part := buffer [:read ]
@@ -1075,11 +1098,12 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m
10751098 for cid , conn := range rpty .activeConns {
10761099 _ , err = conn .Write (part )
10771100 if err != nil {
1078- logger .Debug (ctx ,
1101+ logger .Warn (ctx ,
10791102 "error writing to active conn" ,
10801103 slog .F ("other_conn_id" , cid ),
10811104 slog .Error (err ),
10821105 )
1106+ a .metrics .reconnectingPTYErrors .WithLabelValues ("write" ).Add (1 )
10831107 }
10841108 }
10851109 rpty .activeConnsMutex .Unlock ()
@@ -1099,6 +1123,7 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m
10991123 if err != nil {
11001124 // We can continue after this, it's not fatal!
11011125 logger .Error (ctx , "resize" , slog .Error (err ))
1126+ a .metrics .reconnectingPTYErrors .WithLabelValues ("resize" ).Add (1 )
11021127 }
11031128 // Write any previously stored data for the TTY.
11041129 rpty .circularBufferMutex .RLock ()
@@ -1111,6 +1136,7 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m
11111136 // while also holding circularBufferMutex seems dangerous.
11121137 _ , err = conn .Write (prevBuf )
11131138 if err != nil {
1139+ a .metrics .reconnectingPTYErrors .WithLabelValues ("write" ).Add (1 )
11141140 return xerrors .Errorf ("write buffer to conn: %w" , err )
11151141 }
11161142 // Multiple connections to the same TTY are permitted.
@@ -1161,6 +1187,7 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m
11611187 _ , err = rpty .ptty .InputWriter ().Write ([]byte (req .Data ))
11621188 if err != nil {
11631189 logger .Warn (ctx , "write to pty" , slog .Error (err ))
1190+ a .metrics .reconnectingPTYErrors .WithLabelValues ("input_writer" ).Add (1 )
11641191 return nil
11651192 }
11661193 // Check if a resize needs to happen!
@@ -1171,6 +1198,7 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m
11711198 if err != nil {
11721199 // We can continue after this, it's not fatal!
11731200 logger .Error (ctx , "resize" , slog .Error (err ))
1201+ a .metrics .reconnectingPTYErrors .WithLabelValues ("resize" ).Add (1 )
11741202 }
11751203 }
11761204}
@@ -1203,7 +1231,7 @@ func (a *agent) startReportingConnectionStats(ctx context.Context) {
12031231 var mu sync.Mutex
12041232 status := a .network .Status ()
12051233 durations := []float64 {}
1206- ctx , cancelFunc := context .WithTimeout (ctx , 5 * time .Second )
1234+ pingCtx , cancelFunc := context .WithTimeout (ctx , 5 * time .Second )
12071235 defer cancelFunc ()
12081236 for nodeID , peer := range status .Peer {
12091237 if ! peer .Active {
@@ -1219,7 +1247,7 @@ func (a *agent) startReportingConnectionStats(ctx context.Context) {
12191247 wg .Add (1 )
12201248 go func () {
12211249 defer wg .Done ()
1222- duration , _ , _ , err := a .network .Ping (ctx , addresses [0 ].Addr ())
1250+ duration , _ , _ , err := a .network .Ping (pingCtx , addresses [0 ].Addr ())
12231251 if err != nil {
12241252 return
12251253 }
@@ -1244,7 +1272,10 @@ func (a *agent) startReportingConnectionStats(ctx context.Context) {
12441272 // Collect agent metrics.
12451273 // Agent metrics are changing all the time, so there is no need to perform
12461274 // reflect.DeepEqual to see if stats should be transferred.
1247- stats .Metrics = collectMetrics ()
1275+
1276+ metricsCtx , cancelFunc := context .WithTimeout (ctx , 5 * time .Second )
1277+ defer cancelFunc ()
1278+ stats .Metrics = a .collectMetrics (metricsCtx )
12481279
12491280 a .latestStat .Store (stats )
12501281
0 commit comments