Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 14efdad

Browse files
authored
feat: Collect agent SSH metrics (coder#7584)
1 parent 05da1e9 commit 14efdad

File tree

16 files changed

+555
-59
lines changed

16 files changed

+555
-59
lines changed

agent/agent.go

+37-6
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ import (
2424

2525
"github.com/armon/circbuf"
2626
"github.com/google/uuid"
27+
"github.com/prometheus/client_golang/prometheus"
2728
"github.com/spf13/afero"
2829
"go.uber.org/atomic"
2930
"golang.org/x/exp/slices"
@@ -63,6 +64,8 @@ type Options struct {
6364
SSHMaxTimeout time.Duration
6465
TailnetListenPort uint16
6566
Subsystem codersdk.AgentSubsystem
67+
68+
PrometheusRegistry *prometheus.Registry
6669
}
6770

6871
type Client interface {
@@ -102,6 +105,12 @@ func New(options Options) Agent {
102105
return "", nil
103106
}
104107
}
108+
109+
prometheusRegistry := options.PrometheusRegistry
110+
if prometheusRegistry == nil {
111+
prometheusRegistry = prometheus.NewRegistry()
112+
}
113+
105114
ctx, cancelFunc := context.WithCancel(context.Background())
106115
a := &agent{
107116
tailnetListenPort: options.TailnetListenPort,
@@ -121,6 +130,9 @@ func New(options Options) Agent {
121130
connStatsChan: make(chan *agentsdk.Stats, 1),
122131
sshMaxTimeout: options.SSHMaxTimeout,
123132
subsystem: options.Subsystem,
133+
134+
prometheusRegistry: prometheusRegistry,
135+
metrics: newAgentMetrics(prometheusRegistry),
124136
}
125137
a.init(ctx)
126138
return a
@@ -165,10 +177,13 @@ type agent struct {
165177
latestStat atomic.Pointer[agentsdk.Stats]
166178

167179
connCountReconnectingPTY atomic.Int64
180+
181+
prometheusRegistry *prometheus.Registry
182+
metrics *agentMetrics
168183
}
169184

170185
func (a *agent) init(ctx context.Context) {
171-
sshSrv, err := agentssh.NewServer(ctx, a.logger.Named("ssh-server"), a.filesystem, a.sshMaxTimeout, "")
186+
sshSrv, err := agentssh.NewServer(ctx, a.logger.Named("ssh-server"), a.prometheusRegistry, a.filesystem, a.sshMaxTimeout, "")
172187
if err != nil {
173188
panic(err)
174189
}
@@ -983,6 +998,7 @@ func (a *agent) trackScriptLogs(ctx context.Context, reader io.Reader) (chan str
983998

984999
func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, msg codersdk.WorkspaceAgentReconnectingPTYInit, conn net.Conn) (retErr error) {
9851000
defer conn.Close()
1001+
a.metrics.connectionsTotal.Add(1)
9861002

9871003
a.connCountReconnectingPTY.Add(1)
9881004
defer a.connCountReconnectingPTY.Add(-1)
@@ -1022,6 +1038,7 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m
10221038
// Empty command will default to the users shell!
10231039
cmd, err := a.sshServer.CreateCommand(ctx, msg.Command, nil)
10241040
if err != nil {
1041+
a.metrics.reconnectingPTYErrors.WithLabelValues("create_command").Add(1)
10251042
return xerrors.Errorf("create command: %w", err)
10261043
}
10271044
cmd.Env = append(cmd.Env, "TERM=xterm-256color")
@@ -1034,6 +1051,7 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m
10341051

10351052
ptty, process, err := pty.Start(cmd)
10361053
if err != nil {
1054+
a.metrics.reconnectingPTYErrors.WithLabelValues("start_command").Add(1)
10371055
return xerrors.Errorf("start command: %w", err)
10381056
}
10391057

@@ -1060,7 +1078,12 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m
10601078
if err != nil {
10611079
// When the PTY is closed, this is triggered.
10621080
// Error is typically a benign EOF, so only log for debugging.
1063-
logger.Debug(ctx, "unable to read pty output, command exited?", slog.Error(err))
1081+
if errors.Is(err, io.EOF) {
1082+
logger.Debug(ctx, "unable to read pty output, command exited?", slog.Error(err))
1083+
} else {
1084+
logger.Warn(ctx, "unable to read pty output, command exited?", slog.Error(err))
1085+
a.metrics.reconnectingPTYErrors.WithLabelValues("output_reader").Add(1)
1086+
}
10641087
break
10651088
}
10661089
part := buffer[:read]
@@ -1075,11 +1098,12 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m
10751098
for cid, conn := range rpty.activeConns {
10761099
_, err = conn.Write(part)
10771100
if err != nil {
1078-
logger.Debug(ctx,
1101+
logger.Warn(ctx,
10791102
"error writing to active conn",
10801103
slog.F("other_conn_id", cid),
10811104
slog.Error(err),
10821105
)
1106+
a.metrics.reconnectingPTYErrors.WithLabelValues("write").Add(1)
10831107
}
10841108
}
10851109
rpty.activeConnsMutex.Unlock()
@@ -1099,6 +1123,7 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m
10991123
if err != nil {
11001124
// We can continue after this, it's not fatal!
11011125
logger.Error(ctx, "resize", slog.Error(err))
1126+
a.metrics.reconnectingPTYErrors.WithLabelValues("resize").Add(1)
11021127
}
11031128
// Write any previously stored data for the TTY.
11041129
rpty.circularBufferMutex.RLock()
@@ -1111,6 +1136,7 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m
11111136
// while also holding circularBufferMutex seems dangerous.
11121137
_, err = conn.Write(prevBuf)
11131138
if err != nil {
1139+
a.metrics.reconnectingPTYErrors.WithLabelValues("write").Add(1)
11141140
return xerrors.Errorf("write buffer to conn: %w", err)
11151141
}
11161142
// Multiple connections to the same TTY are permitted.
@@ -1161,6 +1187,7 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m
11611187
_, err = rpty.ptty.InputWriter().Write([]byte(req.Data))
11621188
if err != nil {
11631189
logger.Warn(ctx, "write to pty", slog.Error(err))
1190+
a.metrics.reconnectingPTYErrors.WithLabelValues("input_writer").Add(1)
11641191
return nil
11651192
}
11661193
// Check if a resize needs to happen!
@@ -1171,6 +1198,7 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m
11711198
if err != nil {
11721199
// We can continue after this, it's not fatal!
11731200
logger.Error(ctx, "resize", slog.Error(err))
1201+
a.metrics.reconnectingPTYErrors.WithLabelValues("resize").Add(1)
11741202
}
11751203
}
11761204
}
@@ -1203,7 +1231,7 @@ func (a *agent) startReportingConnectionStats(ctx context.Context) {
12031231
var mu sync.Mutex
12041232
status := a.network.Status()
12051233
durations := []float64{}
1206-
ctx, cancelFunc := context.WithTimeout(ctx, 5*time.Second)
1234+
pingCtx, cancelFunc := context.WithTimeout(ctx, 5*time.Second)
12071235
defer cancelFunc()
12081236
for nodeID, peer := range status.Peer {
12091237
if !peer.Active {
@@ -1219,7 +1247,7 @@ func (a *agent) startReportingConnectionStats(ctx context.Context) {
12191247
wg.Add(1)
12201248
go func() {
12211249
defer wg.Done()
1222-
duration, _, _, err := a.network.Ping(ctx, addresses[0].Addr())
1250+
duration, _, _, err := a.network.Ping(pingCtx, addresses[0].Addr())
12231251
if err != nil {
12241252
return
12251253
}
@@ -1244,7 +1272,10 @@ func (a *agent) startReportingConnectionStats(ctx context.Context) {
12441272
// Collect agent metrics.
12451273
// Agent metrics are changing all the time, so there is no need to perform
12461274
// reflect.DeepEqual to see if stats should be transferred.
1247-
stats.Metrics = collectMetrics()
1275+
1276+
metricsCtx, cancelFunc := context.WithTimeout(ctx, 5*time.Second)
1277+
defer cancelFunc()
1278+
stats.Metrics = a.collectMetrics(metricsCtx)
12481279

12491280
a.latestStat.Store(stats)
12501281

agent/agent_test.go

+119-3
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@ import (
2727
"github.com/google/uuid"
2828
"github.com/pion/udp"
2929
"github.com/pkg/sftp"
30+
"github.com/prometheus/client_golang/prometheus"
31+
promgo "github.com/prometheus/client_model/go"
3032
"github.com/spf13/afero"
3133
"github.com/stretchr/testify/assert"
3234
"github.com/stretchr/testify/require"
@@ -1724,7 +1726,7 @@ func (c closeFunc) Close() error {
17241726
return c()
17251727
}
17261728

1727-
func setupAgent(t *testing.T, metadata agentsdk.Manifest, ptyTimeout time.Duration) (
1729+
func setupAgent(t *testing.T, metadata agentsdk.Manifest, ptyTimeout time.Duration, opts ...func(agent.Options) agent.Options) (
17281730
*codersdk.WorkspaceAgentConn,
17291731
*client,
17301732
<-chan *agentsdk.Stats,
@@ -1749,12 +1751,19 @@ func setupAgent(t *testing.T, metadata agentsdk.Manifest, ptyTimeout time.Durati
17491751
statsChan: statsCh,
17501752
coordinator: coordinator,
17511753
}
1752-
closer := agent.New(agent.Options{
1754+
1755+
options := agent.Options{
17531756
Client: c,
17541757
Filesystem: fs,
17551758
Logger: logger.Named("agent"),
17561759
ReconnectingPTYTimeout: ptyTimeout,
1757-
})
1760+
}
1761+
1762+
for _, opt := range opts {
1763+
options = opt(options)
1764+
}
1765+
1766+
closer := agent.New(options)
17581767
t.Cleanup(func() {
17591768
_ = closer.Close()
17601769
})
@@ -1979,3 +1988,110 @@ func tempDirUnixSocket(t *testing.T) string {
19791988

19801989
return t.TempDir()
19811990
}
1991+
1992+
func TestAgent_Metrics_SSH(t *testing.T) {
1993+
t.Parallel()
1994+
ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitLong)
1995+
defer cancel()
1996+
1997+
registry := prometheus.NewRegistry()
1998+
1999+
//nolint:dogsled
2000+
conn, _, _, _, _ := setupAgent(t, agentsdk.Manifest{}, 0, func(o agent.Options) agent.Options {
2001+
o.PrometheusRegistry = registry
2002+
return o
2003+
})
2004+
2005+
sshClient, err := conn.SSHClient(ctx)
2006+
require.NoError(t, err)
2007+
defer sshClient.Close()
2008+
session, err := sshClient.NewSession()
2009+
require.NoError(t, err)
2010+
defer session.Close()
2011+
stdin, err := session.StdinPipe()
2012+
require.NoError(t, err)
2013+
err = session.Shell()
2014+
require.NoError(t, err)
2015+
2016+
expected := []agentsdk.AgentMetric{
2017+
{
2018+
Name: "agent_reconnecting_pty_connections_total",
2019+
Type: agentsdk.AgentMetricTypeCounter,
2020+
Value: 0,
2021+
},
2022+
{
2023+
Name: "agent_sessions_total",
2024+
Type: agentsdk.AgentMetricTypeCounter,
2025+
Value: 1,
2026+
Labels: []agentsdk.AgentMetricLabel{
2027+
{
2028+
Name: "magic_type",
2029+
Value: "ssh",
2030+
},
2031+
{
2032+
Name: "pty",
2033+
Value: "no",
2034+
},
2035+
},
2036+
},
2037+
{
2038+
Name: "agent_ssh_server_failed_connections_total",
2039+
Type: agentsdk.AgentMetricTypeCounter,
2040+
Value: 0,
2041+
},
2042+
{
2043+
Name: "agent_ssh_server_sftp_connections_total",
2044+
Type: agentsdk.AgentMetricTypeCounter,
2045+
Value: 0,
2046+
},
2047+
{
2048+
Name: "agent_ssh_server_sftp_server_errors_total",
2049+
Type: agentsdk.AgentMetricTypeCounter,
2050+
Value: 0,
2051+
},
2052+
}
2053+
2054+
var actual []*promgo.MetricFamily
2055+
assert.Eventually(t, func() bool {
2056+
actual, err = registry.Gather()
2057+
if err != nil {
2058+
return false
2059+
}
2060+
2061+
if len(expected) != len(actual) {
2062+
return false
2063+
}
2064+
2065+
return verifyCollectedMetrics(t, expected, actual)
2066+
}, testutil.WaitLong, testutil.IntervalFast)
2067+
2068+
require.Len(t, actual, len(expected))
2069+
collected := verifyCollectedMetrics(t, expected, actual)
2070+
require.True(t, collected, "expected metrics were not collected")
2071+
2072+
_ = stdin.Close()
2073+
err = session.Wait()
2074+
require.NoError(t, err)
2075+
}
2076+
2077+
func verifyCollectedMetrics(t *testing.T, expected []agentsdk.AgentMetric, actual []*promgo.MetricFamily) bool {
2078+
t.Helper()
2079+
2080+
for i, e := range expected {
2081+
assert.Equal(t, e.Name, actual[i].GetName())
2082+
assert.Equal(t, string(e.Type), strings.ToLower(actual[i].GetType().String()))
2083+
2084+
for _, m := range actual[i].GetMetric() {
2085+
assert.Equal(t, e.Value, m.Counter.GetValue())
2086+
2087+
if len(m.GetLabel()) > 0 {
2088+
for j, lbl := range m.GetLabel() {
2089+
assert.Equal(t, e.Labels[j].Name, lbl.GetName())
2090+
assert.Equal(t, e.Labels[j].Value, lbl.GetValue())
2091+
}
2092+
}
2093+
m.GetLabel()
2094+
}
2095+
}
2096+
return true
2097+
}

0 commit comments

Comments
 (0)