Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 6bc1f95

Browse files
committed
feat(agent): add connection reporting for SSH and reconnecting PTY (#16652)
Updates #15139 (cherry picked from commit 4ba5a8a)
1 parent 03b5012 commit 6bc1f95

File tree

7 files changed

+370
-32
lines changed

7 files changed

+370
-32
lines changed

agent/agent.go

+155
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import (
88
"fmt"
99
"hash/fnv"
1010
"io"
11+
"net"
1112
"net/http"
1213
"net/netip"
1314
"os"
@@ -28,6 +29,7 @@ import (
2829
"golang.org/x/exp/slices"
2930
"golang.org/x/sync/errgroup"
3031
"golang.org/x/xerrors"
32+
"google.golang.org/protobuf/types/known/timestamppb"
3133
"tailscale.com/net/speedtest"
3234
"tailscale.com/tailcfg"
3335
"tailscale.com/types/netlogtype"
@@ -175,6 +177,7 @@ func New(options Options) Agent {
175177
lifecycleUpdate: make(chan struct{}, 1),
176178
lifecycleReported: make(chan codersdk.WorkspaceAgentLifecycle, 1),
177179
lifecycleStates: []agentsdk.PostLifecycleRequest{{State: codersdk.WorkspaceAgentLifecycleCreated}},
180+
reportConnectionsUpdate: make(chan struct{}, 1),
178181
ignorePorts: options.IgnorePorts,
179182
portCacheDuration: options.PortCacheDuration,
180183
reportMetadataInterval: options.ReportMetadataInterval,
@@ -248,6 +251,10 @@ type agent struct {
248251
lifecycleStates []agentsdk.PostLifecycleRequest
249252
lifecycleLastReportedIndex int // Keeps track of the last lifecycle state we successfully reported.
250253

254+
reportConnectionsUpdate chan struct{}
255+
reportConnectionsMu sync.Mutex
256+
reportConnections []*proto.ReportConnectionRequest
257+
251258
network *tailnet.Conn
252259
statsReporter *statsReporter
253260
logSender *agentsdk.LogSender
@@ -273,6 +280,24 @@ func (a *agent) init() {
273280
UpdateEnv: a.updateCommandEnv,
274281
WorkingDirectory: func() string { return a.manifest.Load().Directory },
275282
BlockFileTransfer: a.blockFileTransfer,
283+
ReportConnection: func(id uuid.UUID, magicType agentssh.MagicSessionType, ip string) func(code int, reason string) {
284+
var connectionType proto.Connection_Type
285+
switch magicType {
286+
case agentssh.MagicSessionTypeSSH:
287+
connectionType = proto.Connection_SSH
288+
case agentssh.MagicSessionTypeVSCode:
289+
connectionType = proto.Connection_VSCODE
290+
case agentssh.MagicSessionTypeJetBrains:
291+
connectionType = proto.Connection_JETBRAINS
292+
case agentssh.MagicSessionTypeUnknown:
293+
connectionType = proto.Connection_TYPE_UNSPECIFIED
294+
default:
295+
a.logger.Error(a.hardCtx, "unhandled magic session type when reporting connection", slog.F("magic_type", magicType))
296+
connectionType = proto.Connection_TYPE_UNSPECIFIED
297+
}
298+
299+
return a.reportConnection(id, connectionType, ip)
300+
},
276301
})
277302
if err != nil {
278303
panic(err)
@@ -295,6 +320,9 @@ func (a *agent) init() {
295320
a.reconnectingPTYServer = reconnectingpty.NewServer(
296321
a.logger.Named("reconnecting-pty"),
297322
a.sshServer,
323+
func(id uuid.UUID, ip string) func(code int, reason string) {
324+
return a.reportConnection(id, proto.Connection_RECONNECTING_PTY, ip)
325+
},
298326
a.metrics.connectionsTotal, a.metrics.reconnectingPTYErrors,
299327
a.reconnectingPTYTimeout,
300328
)
@@ -704,6 +732,129 @@ func (a *agent) setLifecycle(state codersdk.WorkspaceAgentLifecycle) {
704732
}
705733
}
706734

735+
// reportConnectionsLoop reports connections to the agent for auditing.
736+
func (a *agent) reportConnectionsLoop(ctx context.Context, aAPI proto.DRPCAgentClient24) error {
737+
for {
738+
select {
739+
case <-a.reportConnectionsUpdate:
740+
case <-ctx.Done():
741+
return ctx.Err()
742+
}
743+
744+
for {
745+
a.reportConnectionsMu.Lock()
746+
if len(a.reportConnections) == 0 {
747+
a.reportConnectionsMu.Unlock()
748+
break
749+
}
750+
payload := a.reportConnections[0]
751+
// Release lock while we send the payload, this is safe
752+
// since we only append to the slice.
753+
a.reportConnectionsMu.Unlock()
754+
755+
logger := a.logger.With(slog.F("payload", payload))
756+
logger.Debug(ctx, "reporting connection")
757+
_, err := aAPI.ReportConnection(ctx, payload)
758+
if err != nil {
759+
return xerrors.Errorf("failed to report connection: %w", err)
760+
}
761+
762+
logger.Debug(ctx, "successfully reported connection")
763+
764+
// Remove the payload we sent.
765+
a.reportConnectionsMu.Lock()
766+
a.reportConnections[0] = nil // Release the pointer from the underlying array.
767+
a.reportConnections = a.reportConnections[1:]
768+
a.reportConnectionsMu.Unlock()
769+
}
770+
}
771+
}
772+
// reportConnectionBufferLimit caps how many connection reports are buffered
// so that the buffer cannot grow indefinitely. The cap should only be reached
// when the agent has lost its connection to coderd for a long time or when
// the agent is being spammed with connections.
//
// Assuming ~150 bytes per connection report, a full buffer is around 300KB of
// memory, which seems acceptable. If necessary this could be reduced by not
// using the proto struct directly.
const reportConnectionBufferLimit = 2048
784+
785+
func (a *agent) reportConnection(id uuid.UUID, connectionType proto.Connection_Type, ip string) (disconnected func(code int, reason string)) {
786+
// If the experiment hasn't been enabled, we don't report connections.
787+
if !a.experimentalConnectionReports {
788+
return func(int, string) {} // Noop.
789+
}
790+
791+
// Remove the port from the IP because ports are not supported in coderd.
792+
if host, _, err := net.SplitHostPort(ip); err != nil {
793+
a.logger.Error(a.hardCtx, "split host and port for connection report failed", slog.F("ip", ip), slog.Error(err))
794+
} else {
795+
// Best effort.
796+
ip = host
797+
}
798+
799+
a.reportConnectionsMu.Lock()
800+
defer a.reportConnectionsMu.Unlock()
801+
802+
if len(a.reportConnections) >= reportConnectionBufferLimit {
803+
a.logger.Warn(a.hardCtx, "connection report buffer limit reached, dropping connect",
804+
slog.F("limit", reportConnectionBufferLimit),
805+
slog.F("connection_id", id),
806+
slog.F("connection_type", connectionType),
807+
slog.F("ip", ip),
808+
)
809+
} else {
810+
a.reportConnections = append(a.reportConnections, &proto.ReportConnectionRequest{
811+
Connection: &proto.Connection{
812+
Id: id[:],
813+
Action: proto.Connection_CONNECT,
814+
Type: connectionType,
815+
Timestamp: timestamppb.New(time.Now()),
816+
Ip: ip,
817+
StatusCode: 0,
818+
Reason: nil,
819+
},
820+
})
821+
select {
822+
case a.reportConnectionsUpdate <- struct{}{}:
823+
default:
824+
}
825+
}
826+
827+
return func(code int, reason string) {
828+
a.reportConnectionsMu.Lock()
829+
defer a.reportConnectionsMu.Unlock()
830+
if len(a.reportConnections) >= reportConnectionBufferLimit {
831+
a.logger.Warn(a.hardCtx, "connection report buffer limit reached, dropping disconnect",
832+
slog.F("limit", reportConnectionBufferLimit),
833+
slog.F("connection_id", id),
834+
slog.F("connection_type", connectionType),
835+
slog.F("ip", ip),
836+
)
837+
return
838+
}
839+
840+
a.reportConnections = append(a.reportConnections, &proto.ReportConnectionRequest{
841+
Connection: &proto.Connection{
842+
Id: id[:],
843+
Action: proto.Connection_DISCONNECT,
844+
Type: connectionType,
845+
Timestamp: timestamppb.New(time.Now()),
846+
Ip: ip,
847+
StatusCode: int32(code), //nolint:gosec
848+
Reason: &reason,
849+
},
850+
})
851+
select {
852+
case a.reportConnectionsUpdate <- struct{}{}:
853+
default:
854+
}
855+
}
856+
}
857+
707858
// fetchServiceBannerLoop fetches the service banner on an interval. It will
708859
// not be fetched immediately; the expectation is that it is primed elsewhere
709860
// (and must be done before the session actually starts).
@@ -814,6 +965,10 @@ func (a *agent) run() (retErr error) {
814965
return resourcesmonitor.Start(ctx)
815966
})
816967

968+
// Connection reports are part of auditing, we should keep sending them via
969+
// gracefulShutdownBehaviorRemain.
970+
connMan.startAgentAPI("report connections", gracefulShutdownBehaviorRemain, a.reportConnectionsLoop)
971+
817972
// channels to sync goroutines below
818973
// handle manifest
819974
// |

0 commit comments

Comments
 (0)