8
8
"fmt"
9
9
"hash/fnv"
10
10
"io"
11
+ "net"
11
12
"net/http"
12
13
"net/netip"
13
14
"os"
@@ -28,6 +29,7 @@ import (
28
29
"golang.org/x/exp/slices"
29
30
"golang.org/x/sync/errgroup"
30
31
"golang.org/x/xerrors"
32
+ "google.golang.org/protobuf/types/known/timestamppb"
31
33
"tailscale.com/net/speedtest"
32
34
"tailscale.com/tailcfg"
33
35
"tailscale.com/types/netlogtype"
@@ -175,6 +177,7 @@ func New(options Options) Agent {
175
177
lifecycleUpdate : make (chan struct {}, 1 ),
176
178
lifecycleReported : make (chan codersdk.WorkspaceAgentLifecycle , 1 ),
177
179
lifecycleStates : []agentsdk.PostLifecycleRequest {{State : codersdk .WorkspaceAgentLifecycleCreated }},
180
+ reportConnectionsUpdate : make (chan struct {}, 1 ),
178
181
ignorePorts : options .IgnorePorts ,
179
182
portCacheDuration : options .PortCacheDuration ,
180
183
reportMetadataInterval : options .ReportMetadataInterval ,
@@ -248,6 +251,10 @@ type agent struct {
248
251
lifecycleStates []agentsdk.PostLifecycleRequest
249
252
lifecycleLastReportedIndex int // Keeps track of the last lifecycle state we successfully reported.
250
253
254
+ reportConnectionsUpdate chan struct {}
255
+ reportConnectionsMu sync.Mutex
256
+ reportConnections []* proto.ReportConnectionRequest
257
+
251
258
network * tailnet.Conn
252
259
statsReporter * statsReporter
253
260
logSender * agentsdk.LogSender
@@ -273,6 +280,24 @@ func (a *agent) init() {
273
280
UpdateEnv : a .updateCommandEnv ,
274
281
WorkingDirectory : func () string { return a .manifest .Load ().Directory },
275
282
BlockFileTransfer : a .blockFileTransfer ,
283
+ ReportConnection : func (id uuid.UUID , magicType agentssh.MagicSessionType , ip string ) func (code int , reason string ) {
284
+ var connectionType proto.Connection_Type
285
+ switch magicType {
286
+ case agentssh .MagicSessionTypeSSH :
287
+ connectionType = proto .Connection_SSH
288
+ case agentssh .MagicSessionTypeVSCode :
289
+ connectionType = proto .Connection_VSCODE
290
+ case agentssh .MagicSessionTypeJetBrains :
291
+ connectionType = proto .Connection_JETBRAINS
292
+ case agentssh .MagicSessionTypeUnknown :
293
+ connectionType = proto .Connection_TYPE_UNSPECIFIED
294
+ default :
295
+ a .logger .Error (a .hardCtx , "unhandled magic session type when reporting connection" , slog .F ("magic_type" , magicType ))
296
+ connectionType = proto .Connection_TYPE_UNSPECIFIED
297
+ }
298
+
299
+ return a .reportConnection (id , connectionType , ip )
300
+ },
276
301
})
277
302
if err != nil {
278
303
panic (err )
@@ -295,6 +320,9 @@ func (a *agent) init() {
295
320
a .reconnectingPTYServer = reconnectingpty .NewServer (
296
321
a .logger .Named ("reconnecting-pty" ),
297
322
a .sshServer ,
323
+ func (id uuid.UUID , ip string ) func (code int , reason string ) {
324
+ return a .reportConnection (id , proto .Connection_RECONNECTING_PTY , ip )
325
+ },
298
326
a .metrics .connectionsTotal , a .metrics .reconnectingPTYErrors ,
299
327
a .reconnectingPTYTimeout ,
300
328
)
@@ -704,6 +732,129 @@ func (a *agent) setLifecycle(state codersdk.WorkspaceAgentLifecycle) {
704
732
}
705
733
}
706
734
735
+ // reportConnectionsLoop reports connections to the agent for auditing.
736
+ func (a * agent ) reportConnectionsLoop (ctx context.Context , aAPI proto.DRPCAgentClient24 ) error {
737
+ for {
738
+ select {
739
+ case <- a .reportConnectionsUpdate :
740
+ case <- ctx .Done ():
741
+ return ctx .Err ()
742
+ }
743
+
744
+ for {
745
+ a .reportConnectionsMu .Lock ()
746
+ if len (a .reportConnections ) == 0 {
747
+ a .reportConnectionsMu .Unlock ()
748
+ break
749
+ }
750
+ payload := a .reportConnections [0 ]
751
+ // Release lock while we send the payload, this is safe
752
+ // since we only append to the slice.
753
+ a .reportConnectionsMu .Unlock ()
754
+
755
+ logger := a .logger .With (slog .F ("payload" , payload ))
756
+ logger .Debug (ctx , "reporting connection" )
757
+ _ , err := aAPI .ReportConnection (ctx , payload )
758
+ if err != nil {
759
+ return xerrors .Errorf ("failed to report connection: %w" , err )
760
+ }
761
+
762
+ logger .Debug (ctx , "successfully reported connection" )
763
+
764
+ // Remove the payload we sent.
765
+ a .reportConnectionsMu .Lock ()
766
+ a .reportConnections [0 ] = nil // Release the pointer from the underlying array.
767
+ a .reportConnections = a .reportConnections [1 :]
768
+ a .reportConnectionsMu .Unlock ()
769
+ }
770
+ }
771
+ }
772
+
773
// reportConnectionBufferLimit caps how many connection reports may be
// buffered at once, so the buffer cannot grow without bound. Hitting the
// cap is only expected when the agent has lost its connection to coderd
// for a long time, or when the agent is being spammed with connections.
//
// Assuming ~150 bytes per connection report, a full buffer is around 300KB
// of memory, which seems acceptable. If necessary this could be reduced by
// not using the proto struct directly.
const reportConnectionBufferLimit = 2048
784
+
785
+ func (a * agent ) reportConnection (id uuid.UUID , connectionType proto.Connection_Type , ip string ) (disconnected func (code int , reason string )) {
786
+ // If the experiment hasn't been enabled, we don't report connections.
787
+ if ! a .experimentalConnectionReports {
788
+ return func (int , string ) {} // Noop.
789
+ }
790
+
791
+ // Remove the port from the IP because ports are not supported in coderd.
792
+ if host , _ , err := net .SplitHostPort (ip ); err != nil {
793
+ a .logger .Error (a .hardCtx , "split host and port for connection report failed" , slog .F ("ip" , ip ), slog .Error (err ))
794
+ } else {
795
+ // Best effort.
796
+ ip = host
797
+ }
798
+
799
+ a .reportConnectionsMu .Lock ()
800
+ defer a .reportConnectionsMu .Unlock ()
801
+
802
+ if len (a .reportConnections ) >= reportConnectionBufferLimit {
803
+ a .logger .Warn (a .hardCtx , "connection report buffer limit reached, dropping connect" ,
804
+ slog .F ("limit" , reportConnectionBufferLimit ),
805
+ slog .F ("connection_id" , id ),
806
+ slog .F ("connection_type" , connectionType ),
807
+ slog .F ("ip" , ip ),
808
+ )
809
+ } else {
810
+ a .reportConnections = append (a .reportConnections , & proto.ReportConnectionRequest {
811
+ Connection : & proto.Connection {
812
+ Id : id [:],
813
+ Action : proto .Connection_CONNECT ,
814
+ Type : connectionType ,
815
+ Timestamp : timestamppb .New (time .Now ()),
816
+ Ip : ip ,
817
+ StatusCode : 0 ,
818
+ Reason : nil ,
819
+ },
820
+ })
821
+ select {
822
+ case a .reportConnectionsUpdate <- struct {}{}:
823
+ default :
824
+ }
825
+ }
826
+
827
+ return func (code int , reason string ) {
828
+ a .reportConnectionsMu .Lock ()
829
+ defer a .reportConnectionsMu .Unlock ()
830
+ if len (a .reportConnections ) >= reportConnectionBufferLimit {
831
+ a .logger .Warn (a .hardCtx , "connection report buffer limit reached, dropping disconnect" ,
832
+ slog .F ("limit" , reportConnectionBufferLimit ),
833
+ slog .F ("connection_id" , id ),
834
+ slog .F ("connection_type" , connectionType ),
835
+ slog .F ("ip" , ip ),
836
+ )
837
+ return
838
+ }
839
+
840
+ a .reportConnections = append (a .reportConnections , & proto.ReportConnectionRequest {
841
+ Connection : & proto.Connection {
842
+ Id : id [:],
843
+ Action : proto .Connection_DISCONNECT ,
844
+ Type : connectionType ,
845
+ Timestamp : timestamppb .New (time .Now ()),
846
+ Ip : ip ,
847
+ StatusCode : int32 (code ), //nolint:gosec
848
+ Reason : & reason ,
849
+ },
850
+ })
851
+ select {
852
+ case a .reportConnectionsUpdate <- struct {}{}:
853
+ default :
854
+ }
855
+ }
856
+ }
857
+
707
858
// fetchServiceBannerLoop fetches the service banner on an interval. It will
708
859
// not be fetched immediately; the expectation is that it is primed elsewhere
709
860
// (and must be done before the session actually starts).
@@ -814,6 +965,10 @@ func (a *agent) run() (retErr error) {
814
965
return resourcesmonitor .Start (ctx )
815
966
})
816
967
968
+ // Connection reports are part of auditing, we should keep sending them via
969
+ // gracefulShutdownBehaviorRemain.
970
+ connMan .startAgentAPI ("report connections" , gracefulShutdownBehaviorRemain , a .reportConnectionsLoop )
971
+
817
972
// channels to sync goroutines below
818
973
// handle manifest
819
974
// |
0 commit comments