Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 97481a3

Browse files
committed
feat(agent): add connection reporting for SSH and reconnecting PTY (#16652)
Updates #15139 (cherry picked from commit 4ba5a8a)
1 parent 03b5012 commit 97481a3

File tree

7 files changed

+386
-32
lines changed

7 files changed

+386
-32
lines changed

agent/agent.go

+161
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,7 @@ import (
88
"fmt"
99
"hash/fnv"
1010
"io"
11+
"net"
1112
"net/http"
1213
"net/netip"
1314
"os"
@@ -28,6 +29,7 @@ import (
2829
"golang.org/x/exp/slices"
2930
"golang.org/x/sync/errgroup"
3031
"golang.org/x/xerrors"
32+
"google.golang.org/protobuf/types/known/timestamppb"
3133
"tailscale.com/net/speedtest"
3234
"tailscale.com/tailcfg"
3335
"tailscale.com/types/netlogtype"
@@ -88,6 +90,8 @@ type Options struct {
8890
BlockFileTransfer bool
8991
Execer agentexec.Execer
9092
ContainerLister agentcontainers.Lister
93+
94+
ExperimentalConnectionReports bool
9195
}
9296

9397
type Client interface {
@@ -175,6 +179,7 @@ func New(options Options) Agent {
175179
lifecycleUpdate: make(chan struct{}, 1),
176180
lifecycleReported: make(chan codersdk.WorkspaceAgentLifecycle, 1),
177181
lifecycleStates: []agentsdk.PostLifecycleRequest{{State: codersdk.WorkspaceAgentLifecycleCreated}},
182+
reportConnectionsUpdate: make(chan struct{}, 1),
178183
ignorePorts: options.IgnorePorts,
179184
portCacheDuration: options.PortCacheDuration,
180185
reportMetadataInterval: options.ReportMetadataInterval,
@@ -188,6 +193,8 @@ func New(options Options) Agent {
188193
metrics: newAgentMetrics(prometheusRegistry),
189194
execer: options.Execer,
190195
lister: options.ContainerLister,
196+
197+
experimentalConnectionReports: options.ExperimentalConnectionReports,
191198
}
192199
// Initially, we have a closed channel, reflecting the fact that we are not initially connected.
193200
// Each time we connect we replace the channel (while holding the closeMutex) with a new one
@@ -248,6 +255,10 @@ type agent struct {
248255
lifecycleStates []agentsdk.PostLifecycleRequest
249256
lifecycleLastReportedIndex int // Keeps track of the last lifecycle state we successfully reported.
250257

258+
reportConnectionsUpdate chan struct{}
259+
reportConnectionsMu sync.Mutex
260+
reportConnections []*proto.ReportConnectionRequest
261+
251262
network *tailnet.Conn
252263
statsReporter *statsReporter
253264
logSender *agentsdk.LogSender
@@ -258,6 +269,8 @@ type agent struct {
258269
metrics *agentMetrics
259270
execer agentexec.Execer
260271
lister agentcontainers.Lister
272+
273+
experimentalConnectionReports bool
261274
}
262275

263276
func (a *agent) TailnetConn() *tailnet.Conn {
@@ -273,6 +286,24 @@ func (a *agent) init() {
273286
UpdateEnv: a.updateCommandEnv,
274287
WorkingDirectory: func() string { return a.manifest.Load().Directory },
275288
BlockFileTransfer: a.blockFileTransfer,
289+
ReportConnection: func(id uuid.UUID, magicType agentssh.MagicSessionType, ip string) func(code int, reason string) {
290+
var connectionType proto.Connection_Type
291+
switch magicType {
292+
case agentssh.MagicSessionTypeSSH:
293+
connectionType = proto.Connection_SSH
294+
case agentssh.MagicSessionTypeVSCode:
295+
connectionType = proto.Connection_VSCODE
296+
case agentssh.MagicSessionTypeJetBrains:
297+
connectionType = proto.Connection_JETBRAINS
298+
case agentssh.MagicSessionTypeUnknown:
299+
connectionType = proto.Connection_TYPE_UNSPECIFIED
300+
default:
301+
a.logger.Error(a.hardCtx, "unhandled magic session type when reporting connection", slog.F("magic_type", magicType))
302+
connectionType = proto.Connection_TYPE_UNSPECIFIED
303+
}
304+
305+
return a.reportConnection(id, connectionType, ip)
306+
},
276307
})
277308
if err != nil {
278309
panic(err)
@@ -295,6 +326,9 @@ func (a *agent) init() {
295326
a.reconnectingPTYServer = reconnectingpty.NewServer(
296327
a.logger.Named("reconnecting-pty"),
297328
a.sshServer,
329+
func(id uuid.UUID, ip string) func(code int, reason string) {
330+
return a.reportConnection(id, proto.Connection_RECONNECTING_PTY, ip)
331+
},
298332
a.metrics.connectionsTotal, a.metrics.reconnectingPTYErrors,
299333
a.reconnectingPTYTimeout,
300334
)
@@ -704,6 +738,129 @@ func (a *agent) setLifecycle(state codersdk.WorkspaceAgentLifecycle) {
704738
}
705739
}
706740

741+
// reportConnectionsLoop sends the connection reports queued in
// a.reportConnections, one at a time and in queue order, by calling
// aAPI.ReportConnection. Connection reports are used for auditing.
//
// The loop sleeps until it is signaled on a.reportConnectionsUpdate, then
// drains the queue until it is empty. It returns ctx.Err() if ctx is done
// while it is waiting for a signal, and a wrapped error if a send fails. A
// report is only removed from the queue after it was sent successfully, so a
// report whose send failed stays at the head of the queue.
742+
func (a *agent) reportConnectionsLoop(ctx context.Context, aAPI proto.DRPCAgentClient24) error {
743+
for {
744+
// Block until new reports are signaled or the context ends.
select {
745+
case <-a.reportConnectionsUpdate:
746+
case <-ctx.Done():
747+
return ctx.Err()
748+
}
749+
750+
// Drain everything that is currently queued before waiting again.
for {
751+
a.reportConnectionsMu.Lock()
752+
if len(a.reportConnections) == 0 {
753+
a.reportConnectionsMu.Unlock()
754+
break
755+
}
756+
payload := a.reportConnections[0]
757+
// Release the lock while we send the payload so that new reports
758+
// can still be queued. This is safe since writers only append to the slice.
759+
a.reportConnectionsMu.Unlock()
760+
761+
logger := a.logger.With(slog.F("payload", payload))
762+
logger.Debug(ctx, "reporting connection")
763+
_, err := aAPI.ReportConnection(ctx, payload)
764+
if err != nil {
765+
// The payload is intentionally left at the head of the queue.
return xerrors.Errorf("failed to report connection: %w", err)
766+
}
767+
768+
logger.Debug(ctx, "successfully reported connection")
769+
770+
// Remove the payload we sent.
771+
a.reportConnectionsMu.Lock()
772+
a.reportConnections[0] = nil // Release the pointer from the underlying array.
773+
a.reportConnections = a.reportConnections[1:]
774+
a.reportConnectionsMu.Unlock()
775+
}
776+
}
777+
}
778+
779+
const (
780+
// reportConnectionBufferLimit limits the number of connection reports we
781+
// buffer to avoid growing the buffer indefinitely. The limit should not be
782+
// reached unless the agent has lost connection to coderd for a long time or
783+
// the agent is being spammed with connections. Connect and disconnect
// reports share the same buffer; once it is full, new reports are dropped.
784+
//
785+
// If we assume ~150 bytes per connection report, this would be around 300KB
786+
// of memory (2048 * 150 bytes), which seems acceptable. We could reduce this
787+
// if necessary by not using the proto struct directly.
788+
reportConnectionBufferLimit = 2048
789+
)
790+
791+
// reportConnection queues a CONNECT report for the connection identified by
// id and returns a callback that queues the matching DISCONNECT report.
//
// id is the connection ID, connectionType the kind of connection, and ip the
// remote address; a port suffix is stripped from ip when it can be parsed.
// The returned callback takes the status code and reason that are recorded on
// the DISCONNECT report.
//
// When experimentalConnectionReports is false nothing is queued and the
// returned callback is a no-op. When the buffer already holds
// reportConnectionBufferLimit entries, the report is dropped with a warning
// instead of being queued.
func (a *agent) reportConnection(id uuid.UUID, connectionType proto.Connection_Type, ip string) (disconnected func(code int, reason string)) {
792+
// If the experiment hasn't been enabled, we don't report connections.
793+
if !a.experimentalConnectionReports {
794+
return func(int, string) {} // Noop.
795+
}
796+
797+
// Remove the port from the IP because ports are not supported in coderd.
// NOTE(review): net.SplitHostPort fails for addresses without a port
// (including an empty string); such input is logged at error level and
// reported with ip unchanged. Confirm callers always pass host:port.
798+
if host, _, err := net.SplitHostPort(ip); err != nil {
799+
a.logger.Error(a.hardCtx, "split host and port for connection report failed", slog.F("ip", ip), slog.Error(err))
800+
} else {
801+
// Best effort.
802+
ip = host
803+
}
804+
805+
a.reportConnectionsMu.Lock()
806+
defer a.reportConnectionsMu.Unlock()
807+
808+
if len(a.reportConnections) >= reportConnectionBufferLimit {
809+
a.logger.Warn(a.hardCtx, "connection report buffer limit reached, dropping connect",
810+
slog.F("limit", reportConnectionBufferLimit),
811+
slog.F("connection_id", id),
812+
slog.F("connection_type", connectionType),
813+
slog.F("ip", ip),
814+
)
815+
} else {
816+
a.reportConnections = append(a.reportConnections, &proto.ReportConnectionRequest{
817+
Connection: &proto.Connection{
818+
Id: id[:],
819+
Action: proto.Connection_CONNECT,
820+
Type: connectionType,
821+
Timestamp: timestamppb.New(time.Now()),
822+
Ip: ip,
823+
StatusCode: 0,
824+
Reason: nil,
825+
},
826+
})
827+
// Wake the report loop without blocking; if a signal is already
// pending, the default case skips the send.
select {
828+
case a.reportConnectionsUpdate <- struct{}{}:
829+
default:
830+
}
831+
}
832+
833+
// The callback is returned even when the connect report above was
// dropped, so a disconnect may be queued without a matching connect.
// NOTE(review): every call appends a new DISCONNECT report; nothing here
// guards against the callback being invoked more than once.
return func(code int, reason string) {
834+
a.reportConnectionsMu.Lock()
835+
defer a.reportConnectionsMu.Unlock()
836+
if len(a.reportConnections) >= reportConnectionBufferLimit {
837+
a.logger.Warn(a.hardCtx, "connection report buffer limit reached, dropping disconnect",
838+
slog.F("limit", reportConnectionBufferLimit),
839+
slog.F("connection_id", id),
840+
slog.F("connection_type", connectionType),
841+
slog.F("ip", ip),
842+
)
843+
return
844+
}
845+
846+
a.reportConnections = append(a.reportConnections, &proto.ReportConnectionRequest{
847+
Connection: &proto.Connection{
848+
Id: id[:],
849+
Action: proto.Connection_DISCONNECT,
850+
Type: connectionType,
851+
Timestamp: timestamppb.New(time.Now()),
852+
Ip: ip,
853+
StatusCode: int32(code), //nolint:gosec
854+
Reason: &reason,
855+
},
856+
})
857+
// Same non-blocking wake-up as for the connect report.
select {
858+
case a.reportConnectionsUpdate <- struct{}{}:
859+
default:
860+
}
861+
}
862+
}
863+
707864
// fetchServiceBannerLoop fetches the service banner on an interval. It will
708865
// not be fetched immediately; the expectation is that it is primed elsewhere
709866
// (and must be done before the session actually starts).
@@ -814,6 +971,10 @@ func (a *agent) run() (retErr error) {
814971
return resourcesmonitor.Start(ctx)
815972
})
816973

974+
// Connection reports are part of auditing, we should keep sending them via
975+
// gracefulShutdownBehaviorRemain.
976+
connMan.startAgentAPI("report connections", gracefulShutdownBehaviorRemain, a.reportConnectionsLoop)
977+
817978
// channels to sync goroutines below
818979
// handle manifest
819980
// |

0 commit comments

Comments
 (0)