 	"fmt"
 	"hash/fnv"
 	"io"
+	"net"
 	"net/http"
 	"net/netip"
 	"os"
@@ -28,6 +29,7 @@ import (
 	"golang.org/x/exp/slices"
 	"golang.org/x/sync/errgroup"
 	"golang.org/x/xerrors"
+	"google.golang.org/protobuf/types/known/timestamppb"
 	"tailscale.com/net/speedtest"
 	"tailscale.com/tailcfg"
 	"tailscale.com/types/netlogtype"
@@ -88,6 +90,8 @@ type Options struct {
 	BlockFileTransfer bool
 	Execer            agentexec.Execer
 	ContainerLister   agentcontainers.Lister
+
+	ExperimentalConnectionReports bool
 }

type Client interface {
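Since reporting is gated behind this field, a caller has to opt in explicitly. A minimal wiring sketch (not part of this diff; it assumes the package is imported as `agent` and elides all the other required options):

```go
// Hypothetical opt-in: connection reporting is off unless the new
// Options field added above is set.
a := agent.New(agent.Options{
	// ...existing options (logger, client, etc.) elided...
	ExperimentalConnectionReports: true,
})
```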
@@ -175,6 +179,7 @@ func New(options Options) Agent {
 		lifecycleUpdate:         make(chan struct{}, 1),
 		lifecycleReported:       make(chan codersdk.WorkspaceAgentLifecycle, 1),
 		lifecycleStates:         []agentsdk.PostLifecycleRequest{{State: codersdk.WorkspaceAgentLifecycleCreated}},
+		reportConnectionsUpdate: make(chan struct{}, 1),
 		ignorePorts:             options.IgnorePorts,
 		portCacheDuration:       options.PortCacheDuration,
 		reportMetadataInterval:  options.ReportMetadataInterval,
@@ -188,6 +193,8 @@ func New(options Options) Agent {
 		metrics: newAgentMetrics(prometheusRegistry),
 		execer:  options.Execer,
 		lister:  options.ContainerLister,
+
+		experimentalConnectionReports: options.ExperimentalConnectionReports,
 	}
 	// Initially, we have a closed channel, reflecting the fact that we are not initially connected.
 	// Each time we connect we replace the channel (while holding the closeMutex) with a new one
@@ -248,6 +255,10 @@ type agent struct {
 	lifecycleStates            []agentsdk.PostLifecycleRequest
 	lifecycleLastReportedIndex int // Keeps track of the last lifecycle state we successfully reported.

+	reportConnectionsUpdate chan struct{}
+	reportConnectionsMu     sync.Mutex
+	reportConnections       []*proto.ReportConnectionRequest
+
 	network       *tailnet.Conn
 	statsReporter *statsReporter
 	logSender     *agentsdk.LogSender
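The capacity-1 `reportConnectionsUpdate` channel, combined with the non-blocking sends seen later in `reportConnection`, is the standard coalescing wake-up pattern: many enqueues collapse into at most one pending signal, and the consumer drains the whole queue per wake-up. The pattern in isolation (generic sketch, not code from this diff):

```go
// Coalescing wake-up: non-blocking send on a capacity-1 channel.
update := make(chan struct{}, 1)

notify := func() {
	select {
	case update <- struct{}{}: // first notification arms the signal
	default: // already armed; the pending wake-up covers this one too
	}
}
```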
@@ -258,6 +269,8 @@ type agent struct {
 	metrics *agentMetrics
 	execer  agentexec.Execer
 	lister  agentcontainers.Lister
+
+	experimentalConnectionReports bool
 }

func (a *agent) TailnetConn() *tailnet.Conn {
@@ -273,6 +286,24 @@ func (a *agent) init() {
 		UpdateEnv:         a.updateCommandEnv,
 		WorkingDirectory:  func() string { return a.manifest.Load().Directory },
 		BlockFileTransfer: a.blockFileTransfer,
+		ReportConnection: func(id uuid.UUID, magicType agentssh.MagicSessionType, ip string) func(code int, reason string) {
+			var connectionType proto.Connection_Type
+			switch magicType {
+			case agentssh.MagicSessionTypeSSH:
+				connectionType = proto.Connection_SSH
+			case agentssh.MagicSessionTypeVSCode:
+				connectionType = proto.Connection_VSCODE
+			case agentssh.MagicSessionTypeJetBrains:
+				connectionType = proto.Connection_JETBRAINS
+			case agentssh.MagicSessionTypeUnknown:
+				connectionType = proto.Connection_TYPE_UNSPECIFIED
+			default:
+				a.logger.Error(a.hardCtx, "unhandled magic session type when reporting connection", slog.F("magic_type", magicType))
+				connectionType = proto.Connection_TYPE_UNSPECIFIED
+			}
+
+			return a.reportConnection(id, connectionType, ip)
+		},
 	})
 	if err != nil {
 		panic(err)
@@ -295,6 +326,9 @@ func (a *agent) init() {
 	a.reconnectingPTYServer = reconnectingpty.NewServer(
 		a.logger.Named("reconnecting-pty"),
 		a.sshServer,
+		func(id uuid.UUID, ip string) func(code int, reason string) {
+			return a.reportConnection(id, proto.Connection_RECONNECTING_PTY, ip)
+		},
 		a.metrics.connectionsTotal, a.metrics.reconnectingPTYErrors,
 		a.reconnectingPTYTimeout,
 	)
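Both factories above share a two-phase shape: calling the factory records the CONNECT event and returns a closure that records the matching DISCONNECT. A sketch of how a session handler is expected to drive it (the handler and `runSession` are hypothetical, not part of this diff; `uuid` is github.com/google/uuid):

```go
// Sketch: driving the two-phase report callback from a session handler.
// `report` would be one of the factories registered above; `runSession`
// stands in for the actual session body.
func handleSession(report func(id uuid.UUID, ip string) func(code int, reason string), remoteAddr string) error {
	disconnected := report(uuid.New(), remoteAddr) // queues the CONNECT event
	if err := runSession(); err != nil {
		disconnected(1, err.Error()) // queues a DISCONNECT with code and reason
		return err
	}
	disconnected(0, "") // clean close
	return nil
}
```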
@@ -704,6 +738,129 @@ func (a *agent) setLifecycle(state codersdk.WorkspaceAgentLifecycle) {
 	}
 }

+// reportConnectionsLoop reports buffered connection events to coderd for auditing.
+func (a *agent) reportConnectionsLoop(ctx context.Context, aAPI proto.DRPCAgentClient24) error {
+	for {
+		select {
+		case <-a.reportConnectionsUpdate:
+		case <-ctx.Done():
+			return ctx.Err()
+		}
+
+		for {
+			a.reportConnectionsMu.Lock()
+			if len(a.reportConnections) == 0 {
+				a.reportConnectionsMu.Unlock()
+				break
+			}
+			payload := a.reportConnections[0]
+			// Release the lock while we send the payload; this is safe
+			// since we only append to the slice.
+			a.reportConnectionsMu.Unlock()
+
+			logger := a.logger.With(slog.F("payload", payload))
+			logger.Debug(ctx, "reporting connection")
+			_, err := aAPI.ReportConnection(ctx, payload)
+			if err != nil {
+				return xerrors.Errorf("failed to report connection: %w", err)
+			}
+
+			logger.Debug(ctx, "successfully reported connection")
+
+			// Remove the payload we sent.
+			a.reportConnectionsMu.Lock()
+			a.reportConnections[0] = nil // Release the pointer from the underlying array.
+			a.reportConnections = a.reportConnections[1:]
+			a.reportConnectionsMu.Unlock()
+		}
+	}
+}
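The inner loop is a retry-friendly FIFO drain: the head is read under the mutex, sent with the mutex released (safe because producers only append), and popped only after the send succeeds, so a failed send leaves it queued for the restarted loop. The same invariant in isolation (generic sketch, not code from this diff):

```go
// drainQueue sketches the pattern used by reportConnectionsLoop above,
// assuming producers only ever append under mu.
func drainQueue[T any](mu *sync.Mutex, queue *[]*T, send func(*T) error) error {
	for {
		mu.Lock()
		if len(*queue) == 0 {
			mu.Unlock()
			return nil
		}
		head := (*queue)[0]
		mu.Unlock() // safe: producers only append, so head stays valid

		if err := send(head); err != nil {
			return err // head remains queued and is retried on restart
		}

		mu.Lock()
		(*queue)[0] = nil // drop the pointer so the element can be GC'd
		*queue = (*queue)[1:]
		mu.Unlock()
	}
}
```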
+
+const (
+	// reportConnectionBufferLimit limits the number of connection reports we
+	// buffer to avoid growing the buffer indefinitely. This should not happen
+	// unless the agent has lost connection to coderd for a long time or if
+	// the agent is being spammed with connections.
+	//
+	// If we assume ~150 bytes per connection report, this works out to around
+	// 300KB of memory, which seems acceptable. We could reduce this if
+	// necessary by not using the proto struct directly.
+	reportConnectionBufferLimit = 2048
+)
+
+func (a *agent) reportConnection(id uuid.UUID, connectionType proto.Connection_Type, ip string) (disconnected func(code int, reason string)) {
+	// If the experiment hasn't been enabled, we don't report connections.
+	if !a.experimentalConnectionReports {
+		return func(int, string) {} // Noop.
+	}
+
+	// Remove the port from the IP because ports are not supported in coderd.
+	if host, _, err := net.SplitHostPort(ip); err != nil {
+		a.logger.Error(a.hardCtx, "split host and port for connection report failed", slog.F("ip", ip), slog.Error(err))
+	} else {
+		// Best effort.
+		ip = host
+	}
+
+	a.reportConnectionsMu.Lock()
+	defer a.reportConnectionsMu.Unlock()
+
+	if len(a.reportConnections) >= reportConnectionBufferLimit {
+		a.logger.Warn(a.hardCtx, "connection report buffer limit reached, dropping connect",
+			slog.F("limit", reportConnectionBufferLimit),
+			slog.F("connection_id", id),
+			slog.F("connection_type", connectionType),
+			slog.F("ip", ip),
+		)
+	} else {
+		a.reportConnections = append(a.reportConnections, &proto.ReportConnectionRequest{
+			Connection: &proto.Connection{
+				Id:         id[:],
+				Action:     proto.Connection_CONNECT,
+				Type:       connectionType,
+				Timestamp:  timestamppb.New(time.Now()),
+				Ip:         ip,
+				StatusCode: 0,
+				Reason:     nil,
+			},
+		})
+		select {
+		case a.reportConnectionsUpdate <- struct{}{}:
+		default:
+		}
+	}
+
+	return func(code int, reason string) {
+		a.reportConnectionsMu.Lock()
+		defer a.reportConnectionsMu.Unlock()
+		if len(a.reportConnections) >= reportConnectionBufferLimit {
+			a.logger.Warn(a.hardCtx, "connection report buffer limit reached, dropping disconnect",
+				slog.F("limit", reportConnectionBufferLimit),
+				slog.F("connection_id", id),
+				slog.F("connection_type", connectionType),
+				slog.F("ip", ip),
+			)
+			return
+		}
+
+		a.reportConnections = append(a.reportConnections, &proto.ReportConnectionRequest{
+			Connection: &proto.Connection{
+				Id:         id[:],
+				Action:     proto.Connection_DISCONNECT,
+				Type:       connectionType,
+				Timestamp:  timestamppb.New(time.Now()),
+				Ip:         ip,
+				StatusCode: int32(code), //nolint:gosec
+				Reason:     &reason,
+			},
+		})
+		select {
+		case a.reportConnectionsUpdate <- struct{}{}:
+		default:
+		}
+	}
+}
+
 // fetchServiceBannerLoop fetches the service banner on an interval. It will
 // not be fetched immediately; the expectation is that it is primed elsewhere
 // (and must be done before the session actually starts).
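The `SplitHostPort` handling in `reportConnection` is best effort by design: callers typically pass a remote address in host:port form, but a value without a port makes `net.SplitHostPort` fail, and the raw value is still worth reporting. The normalization as a standalone sketch (hypothetical helper, not part of this diff):

```go
// normalizeReportIP mirrors the best-effort port stripping above.
func normalizeReportIP(ip string) string {
	host, _, err := net.SplitHostPort(ip)
	if err != nil {
		// e.g. "203.0.113.5" fails with "missing port in address";
		// report the original value unchanged.
		return ip
	}
	return host // e.g. "203.0.113.5:58204" -> "203.0.113.5"
}
```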
@@ -814,6 +971,10 @@ func (a *agent) run() (retErr error) {
 		return resourcesmonitor.Start(ctx)
 	})

+	// Connection reports are part of auditing, so we keep sending them via
+	// gracefulShutdownBehaviorRemain.
+	connMan.startAgentAPI("report connections", gracefulShutdownBehaviorRemain, a.reportConnectionsLoop)
+
 	// channels to sync goroutines below
 	//  handle manifest
 	//  |