88 "fmt"
99 "hash/fnv"
1010 "io"
11+ "net"
1112 "net/http"
1213 "net/netip"
1314 "os"
@@ -28,6 +29,7 @@ import (
2829 "golang.org/x/exp/slices"
2930 "golang.org/x/sync/errgroup"
3031 "golang.org/x/xerrors"
32+ "google.golang.org/protobuf/types/known/timestamppb"
3133 "tailscale.com/net/speedtest"
3234 "tailscale.com/tailcfg"
3335 "tailscale.com/types/netlogtype"
@@ -88,6 +90,8 @@ type Options struct {
8890 BlockFileTransfer bool
8991 Execer agentexec.Execer
9092 ContainerLister agentcontainers.Lister
93+
94+ ExperimentalConnectionReports bool
9195}
9296
9397type Client interface {
@@ -175,6 +179,7 @@ func New(options Options) Agent {
175179 lifecycleUpdate : make (chan struct {}, 1 ),
176180 lifecycleReported : make (chan codersdk.WorkspaceAgentLifecycle , 1 ),
177181 lifecycleStates : []agentsdk.PostLifecycleRequest {{State : codersdk .WorkspaceAgentLifecycleCreated }},
182+ reportConnectionsUpdate : make (chan struct {}, 1 ),
178183 ignorePorts : options .IgnorePorts ,
179184 portCacheDuration : options .PortCacheDuration ,
180185 reportMetadataInterval : options .ReportMetadataInterval ,
@@ -188,6 +193,8 @@ func New(options Options) Agent {
188193 metrics : newAgentMetrics (prometheusRegistry ),
189194 execer : options .Execer ,
190195 lister : options .ContainerLister ,
196+
197+ experimentalConnectionReports : options .ExperimentalConnectionReports ,
191198 }
192199 // Initially, we have a closed channel, reflecting the fact that we are not initially connected.
193200 // Each time we connect we replace the channel (while holding the closeMutex) with a new one
@@ -248,6 +255,10 @@ type agent struct {
248255 lifecycleStates []agentsdk.PostLifecycleRequest
249256 lifecycleLastReportedIndex int // Keeps track of the last lifecycle state we successfully reported.
250257
258+ reportConnectionsUpdate chan struct {}
259+ reportConnectionsMu sync.Mutex
260+ reportConnections []* proto.ReportConnectionRequest
261+
251262 network * tailnet.Conn
252263 statsReporter * statsReporter
253264 logSender * agentsdk.LogSender
@@ -258,6 +269,8 @@ type agent struct {
258269 metrics * agentMetrics
259270 execer agentexec.Execer
260271 lister agentcontainers.Lister
272+
273+ experimentalConnectionReports bool
261274}
262275
263276func (a * agent ) TailnetConn () * tailnet.Conn {
@@ -273,6 +286,24 @@ func (a *agent) init() {
273286 UpdateEnv : a .updateCommandEnv ,
274287 WorkingDirectory : func () string { return a .manifest .Load ().Directory },
275288 BlockFileTransfer : a .blockFileTransfer ,
289+ ReportConnection : func (id uuid.UUID , magicType agentssh.MagicSessionType , ip string ) func (code int , reason string ) {
290+ var connectionType proto.Connection_Type
291+ switch magicType {
292+ case agentssh .MagicSessionTypeSSH :
293+ connectionType = proto .Connection_SSH
294+ case agentssh .MagicSessionTypeVSCode :
295+ connectionType = proto .Connection_VSCODE
296+ case agentssh .MagicSessionTypeJetBrains :
297+ connectionType = proto .Connection_JETBRAINS
298+ case agentssh .MagicSessionTypeUnknown :
299+ connectionType = proto .Connection_TYPE_UNSPECIFIED
300+ default :
301+ a .logger .Error (a .hardCtx , "unhandled magic session type when reporting connection" , slog .F ("magic_type" , magicType ))
302+ connectionType = proto .Connection_TYPE_UNSPECIFIED
303+ }
304+
305+ return a .reportConnection (id , connectionType , ip )
306+ },
276307 })
277308 if err != nil {
278309 panic (err )
@@ -295,6 +326,9 @@ func (a *agent) init() {
295326 a .reconnectingPTYServer = reconnectingpty .NewServer (
296327 a .logger .Named ("reconnecting-pty" ),
297328 a .sshServer ,
329+ func (id uuid.UUID , ip string ) func (code int , reason string ) {
330+ return a .reportConnection (id , proto .Connection_RECONNECTING_PTY , ip )
331+ },
298332 a .metrics .connectionsTotal , a .metrics .reconnectingPTYErrors ,
299333 a .reconnectingPTYTimeout ,
300334 )
@@ -704,6 +738,129 @@ func (a *agent) setLifecycle(state codersdk.WorkspaceAgentLifecycle) {
704738 }
705739}
706740
741+ // reportConnectionsLoop reports connections to the agent for auditing.
742+ func (a * agent ) reportConnectionsLoop (ctx context.Context , aAPI proto.DRPCAgentClient24 ) error {
743+ for {
744+ select {
745+ case <- a .reportConnectionsUpdate :
746+ case <- ctx .Done ():
747+ return ctx .Err ()
748+ }
749+
750+ for {
751+ a .reportConnectionsMu .Lock ()
752+ if len (a .reportConnections ) == 0 {
753+ a .reportConnectionsMu .Unlock ()
754+ break
755+ }
756+ payload := a .reportConnections [0 ]
757+ // Release lock while we send the payload, this is safe
758+ // since we only append to the slice.
759+ a .reportConnectionsMu .Unlock ()
760+
761+ logger := a .logger .With (slog .F ("payload" , payload ))
762+ logger .Debug (ctx , "reporting connection" )
763+ _ , err := aAPI .ReportConnection (ctx , payload )
764+ if err != nil {
765+ return xerrors .Errorf ("failed to report connection: %w" , err )
766+ }
767+
768+ logger .Debug (ctx , "successfully reported connection" )
769+
770+ // Remove the payload we sent.
771+ a .reportConnectionsMu .Lock ()
772+ a .reportConnections [0 ] = nil // Release the pointer from the underlying array.
773+ a .reportConnections = a .reportConnections [1 :]
774+ a .reportConnectionsMu .Unlock ()
775+ }
776+ }
777+ }
778+
// reportConnectionBufferLimit caps how many connection reports may be queued
// at once so the buffer cannot grow without bound. Hitting the cap should
// only happen when the agent has been disconnected from coderd for a long
// time, or when the agent is being flooded with connections.
//
// Assuming roughly 150 bytes per connection report, a full buffer is around
// 300KB of memory, which seems acceptable. If that ever needs to shrink, the
// queue could store something smaller than the proto struct.
const reportConnectionBufferLimit = 2048
790+
791+ func (a * agent ) reportConnection (id uuid.UUID , connectionType proto.Connection_Type , ip string ) (disconnected func (code int , reason string )) {
792+ // If the experiment hasn't been enabled, we don't report connections.
793+ if ! a .experimentalConnectionReports {
794+ return func (int , string ) {} // Noop.
795+ }
796+
797+ // Remove the port from the IP because ports are not supported in coderd.
798+ if host , _ , err := net .SplitHostPort (ip ); err != nil {
799+ a .logger .Error (a .hardCtx , "split host and port for connection report failed" , slog .F ("ip" , ip ), slog .Error (err ))
800+ } else {
801+ // Best effort.
802+ ip = host
803+ }
804+
805+ a .reportConnectionsMu .Lock ()
806+ defer a .reportConnectionsMu .Unlock ()
807+
808+ if len (a .reportConnections ) >= reportConnectionBufferLimit {
809+ a .logger .Warn (a .hardCtx , "connection report buffer limit reached, dropping connect" ,
810+ slog .F ("limit" , reportConnectionBufferLimit ),
811+ slog .F ("connection_id" , id ),
812+ slog .F ("connection_type" , connectionType ),
813+ slog .F ("ip" , ip ),
814+ )
815+ } else {
816+ a .reportConnections = append (a .reportConnections , & proto.ReportConnectionRequest {
817+ Connection : & proto.Connection {
818+ Id : id [:],
819+ Action : proto .Connection_CONNECT ,
820+ Type : connectionType ,
821+ Timestamp : timestamppb .New (time .Now ()),
822+ Ip : ip ,
823+ StatusCode : 0 ,
824+ Reason : nil ,
825+ },
826+ })
827+ select {
828+ case a .reportConnectionsUpdate <- struct {}{}:
829+ default :
830+ }
831+ }
832+
833+ return func (code int , reason string ) {
834+ a .reportConnectionsMu .Lock ()
835+ defer a .reportConnectionsMu .Unlock ()
836+ if len (a .reportConnections ) >= reportConnectionBufferLimit {
837+ a .logger .Warn (a .hardCtx , "connection report buffer limit reached, dropping disconnect" ,
838+ slog .F ("limit" , reportConnectionBufferLimit ),
839+ slog .F ("connection_id" , id ),
840+ slog .F ("connection_type" , connectionType ),
841+ slog .F ("ip" , ip ),
842+ )
843+ return
844+ }
845+
846+ a .reportConnections = append (a .reportConnections , & proto.ReportConnectionRequest {
847+ Connection : & proto.Connection {
848+ Id : id [:],
849+ Action : proto .Connection_DISCONNECT ,
850+ Type : connectionType ,
851+ Timestamp : timestamppb .New (time .Now ()),
852+ Ip : ip ,
853+ StatusCode : int32 (code ), //nolint:gosec
854+ Reason : & reason ,
855+ },
856+ })
857+ select {
858+ case a .reportConnectionsUpdate <- struct {}{}:
859+ default :
860+ }
861+ }
862+ }
863+
707864// fetchServiceBannerLoop fetches the service banner on an interval. It will
708865// not be fetched immediately; the expectation is that it is primed elsewhere
709866// (and must be done before the session actually starts).
@@ -814,6 +971,10 @@ func (a *agent) run() (retErr error) {
814971 return resourcesmonitor .Start (ctx )
815972 })
816973
974+ // Connection reports are part of auditing, we should keep sending them via
975+ // gracefulShutdownBehaviorRemain.
976+ connMan .startAgentAPI ("report connections" , gracefulShutdownBehaviorRemain , a .reportConnectionsLoop )
977+
817978 // channels to sync goroutines below
818979 // handle manifest
819980 // |
0 commit comments