@@ -122,6 +122,7 @@ func New(options Options) io.Closer {
122
122
logDir : options .LogDir ,
123
123
tempDir : options .TempDir ,
124
124
lifecycleUpdate : make (chan struct {}, 1 ),
125
+ lifecycleReported : make (chan codersdk.WorkspaceAgentLifecycle , 1 ),
125
126
connStatsChan : make (chan * agentsdk.Stats , 1 ),
126
127
}
127
128
a .init (ctx )
@@ -150,9 +151,10 @@ type agent struct {
150
151
sessionToken atomic.Pointer [string ]
151
152
sshServer * ssh.Server
152
153
153
- lifecycleUpdate chan struct {}
154
- lifecycleMu sync.Mutex // Protects following.
155
- lifecycleState codersdk.WorkspaceAgentLifecycle
154
+ lifecycleUpdate chan struct {}
155
+ lifecycleReported chan codersdk.WorkspaceAgentLifecycle
156
+ lifecycleMu sync.RWMutex // Protects following.
157
+ lifecycleState codersdk.WorkspaceAgentLifecycle
156
158
157
159
network * tailnet.Conn
158
160
connStatsChan chan * agentsdk.Stats
@@ -205,9 +207,9 @@ func (a *agent) reportLifecycleLoop(ctx context.Context) {
205
207
}
206
208
207
209
for r := retry .New (time .Second , 15 * time .Second ); r .Wait (ctx ); {
208
- a .lifecycleMu .Lock ()
210
+ a .lifecycleMu .RLock ()
209
211
state := a .lifecycleState
210
- a .lifecycleMu .Unlock ()
212
+ a .lifecycleMu .RUnlock ()
211
213
212
214
if state == lastReported {
213
215
break
@@ -220,6 +222,11 @@ func (a *agent) reportLifecycleLoop(ctx context.Context) {
220
222
})
221
223
if err == nil {
222
224
lastReported = state
225
+ select {
226
+ case a .lifecycleReported <- state :
227
+ case <- a .lifecycleReported :
228
+ a .lifecycleReported <- state
229
+ }
223
230
break
224
231
}
225
232
if xerrors .Is (err , context .Canceled ) || xerrors .Is (err , context .DeadlineExceeded ) {
@@ -231,13 +238,20 @@ func (a *agent) reportLifecycleLoop(ctx context.Context) {
231
238
}
232
239
}
233
240
241
+ // setLifecycle sets the lifecycle state and notifies the lifecycle loop.
242
+ // The state is only updated if it's a valid state transition.
234
243
func (a * agent ) setLifecycle (ctx context.Context , state codersdk.WorkspaceAgentLifecycle ) {
235
244
a .lifecycleMu .Lock ()
236
- defer a .lifecycleMu .Unlock ()
237
-
238
- a .logger .Debug (ctx , "set lifecycle state" , slog .F ("state" , state ), slog .F ("previous" , a .lifecycleState ))
239
-
245
+ lastState := a .lifecycleState
246
+ if slices .Index (codersdk .WorkspaceAgentLifecycleOrder , lastState ) > slices .Index (codersdk .WorkspaceAgentLifecycleOrder , state ) {
247
+ a .logger .Warn (ctx , "attempted to set lifecycle state to a previous state" , slog .F ("last" , lastState ), slog .F ("state" , state ))
248
+ a .lifecycleMu .Unlock ()
249
+ return
250
+ }
240
251
a .lifecycleState = state
252
+ a .logger .Debug (ctx , "set lifecycle state" , slog .F ("state" , state ), slog .F ("last" , lastState ))
253
+ a .lifecycleMu .Unlock ()
254
+
241
255
select {
242
256
case a .lifecycleUpdate <- struct {}{}:
243
257
default :
@@ -297,9 +311,10 @@ func (a *agent) run(ctx context.Context) error {
297
311
}
298
312
}
299
313
314
+ lifecycleState := codersdk .WorkspaceAgentLifecycleReady
300
315
scriptDone := make (chan error , 1 )
301
316
scriptStart := time .Now ()
302
- err : = a .trackConnGoroutine (func () {
317
+ err = a .trackConnGoroutine (func () {
303
318
defer close (scriptDone )
304
319
scriptDone <- a .runStartupScript (ctx , metadata .StartupScript )
305
320
})
@@ -327,16 +342,17 @@ func (a *agent) run(ctx context.Context) error {
327
342
if errors .Is (err , context .Canceled ) {
328
343
return
329
344
}
330
- execTime := time .Since (scriptStart )
331
- lifecycleStatus := codersdk .WorkspaceAgentLifecycleReady
332
- if err != nil {
333
- a .logger .Warn (ctx , "startup script failed" , slog .F ("execution_time" , execTime ), slog .Error (err ))
334
- lifecycleStatus = codersdk .WorkspaceAgentLifecycleStartError
335
- } else {
336
- a .logger .Info (ctx , "startup script completed" , slog .F ("execution_time" , execTime ))
345
+ // Only log if there was a startup script.
346
+ if metadata .StartupScript != "" {
347
+ execTime := time .Since (scriptStart )
348
+ if err != nil {
349
+ a .logger .Warn (ctx , "startup script failed" , slog .F ("execution_time" , execTime ), slog .Error (err ))
350
+ lifecycleState = codersdk .WorkspaceAgentLifecycleStartError
351
+ } else {
352
+ a .logger .Info (ctx , "startup script completed" , slog .F ("execution_time" , execTime ))
353
+ }
337
354
}
338
-
339
- a .setLifecycle (ctx , lifecycleStatus )
355
+ a .setLifecycle (ctx , lifecycleState )
340
356
}()
341
357
}
342
358
@@ -604,14 +620,22 @@ func (a *agent) runCoordinator(ctx context.Context, network *tailnet.Conn) error
604
620
}
605
621
606
622
func (a * agent ) runStartupScript (ctx context.Context , script string ) error {
623
+ return a .runScript (ctx , "startup" , script )
624
+ }
625
+
626
+ func (a * agent ) runShutdownScript (ctx context.Context , script string ) error {
627
+ return a .runScript (ctx , "shutdown" , script )
628
+ }
629
+
630
+ func (a * agent ) runScript (ctx context.Context , lifecycle , script string ) error {
607
631
if script == "" {
608
632
return nil
609
633
}
610
634
611
- a .logger .Info (ctx , "running startup script" , slog .F ("script" , script ))
612
- writer , err := a .filesystem .OpenFile (filepath .Join (a .logDir , "coder-startup -script.log" ), os .O_CREATE | os .O_RDWR , 0o600 )
635
+ a .logger .Info (ctx , "running script" , slog . F ( "lifecycle" , lifecycle ) , slog .F ("script" , script ))
636
+ writer , err := a .filesystem .OpenFile (filepath .Join (a .logDir , fmt . Sprintf ( "coder-%s -script.log" , lifecycle ) ), os .O_CREATE | os .O_RDWR , 0o600 )
613
637
if err != nil {
614
- return xerrors .Errorf ("open startup script log file: %w" , err )
638
+ return xerrors .Errorf ("open %s script log file: %w" , lifecycle , err )
615
639
}
616
640
defer func () {
617
641
_ = writer .Close ()
@@ -772,7 +796,7 @@ func (a *agent) createCommand(ctx context.Context, rawCommand string, env []stri
772
796
773
797
rawMetadata := a .metadata .Load ()
774
798
if rawMetadata == nil {
775
- return nil , xerrors .Errorf ("no metadata was provided: %w" , err )
799
+ return nil , xerrors .Errorf ("no metadata was provided" )
776
800
}
777
801
metadata , valid := rawMetadata .(agentsdk.Metadata )
778
802
if ! valid {
@@ -1296,13 +1320,73 @@ func (a *agent) Close() error {
1296
1320
if a .isClosed () {
1297
1321
return nil
1298
1322
}
1323
+
1324
+ ctx := context .Background ()
1325
+ a .setLifecycle (ctx , codersdk .WorkspaceAgentLifecycleShuttingDown )
1326
+
1327
+ lifecycleState := codersdk .WorkspaceAgentLifecycleOff
1328
+ if metadata , ok := a .metadata .Load ().(agentsdk.Metadata ); ok && metadata .ShutdownScript != "" {
1329
+ scriptDone := make (chan error , 1 )
1330
+ scriptStart := time .Now ()
1331
+ go func () {
1332
+ defer close (scriptDone )
1333
+ scriptDone <- a .runShutdownScript (ctx , metadata .ShutdownScript )
1334
+ }()
1335
+
1336
+ var timeout <- chan time.Time
1337
+ // If timeout is zero, an older version of the coder
1338
+ // provider was used. Otherwise a timeout is always > 0.
1339
+ if metadata .ShutdownScriptTimeout > 0 {
1340
+ t := time .NewTimer (metadata .ShutdownScriptTimeout )
1341
+ defer t .Stop ()
1342
+ timeout = t .C
1343
+ }
1344
+
1345
+ var err error
1346
+ select {
1347
+ case err = <- scriptDone :
1348
+ case <- timeout :
1349
+ a .logger .Warn (ctx , "shutdown script timed out" )
1350
+ a .setLifecycle (ctx , codersdk .WorkspaceAgentLifecycleShutdownTimeout )
1351
+ err = <- scriptDone // The script can still complete after a timeout.
1352
+ }
1353
+ execTime := time .Since (scriptStart )
1354
+ if err != nil {
1355
+ a .logger .Warn (ctx , "shutdown script failed" , slog .F ("execution_time" , execTime ), slog .Error (err ))
1356
+ lifecycleState = codersdk .WorkspaceAgentLifecycleShutdownError
1357
+ } else {
1358
+ a .logger .Info (ctx , "shutdown script completed" , slog .F ("execution_time" , execTime ))
1359
+ }
1360
+ }
1361
+
1362
+ // Set final state and wait for it to be reported because context
1363
+ // cancellation will stop the report loop.
1364
+ a .setLifecycle (ctx , lifecycleState )
1365
+
1366
+ // Wait for the lifecycle to be reported, but don't wait forever so
1367
+ // that we don't break user expectations.
1368
+ ctx , cancel := context .WithTimeout (ctx , 5 * time .Second )
1369
+ defer cancel ()
1370
+ lifecycleWaitLoop:
1371
+ for {
1372
+ select {
1373
+ case <- ctx .Done ():
1374
+ break lifecycleWaitLoop
1375
+ case s := <- a .lifecycleReported :
1376
+ if s == lifecycleState {
1377
+ break lifecycleWaitLoop
1378
+ }
1379
+ }
1380
+ }
1381
+
1299
1382
close (a .closed )
1300
1383
a .closeCancel ()
1384
+ _ = a .sshServer .Close ()
1301
1385
if a .network != nil {
1302
1386
_ = a .network .Close ()
1303
1387
}
1304
- _ = a .sshServer .Close ()
1305
1388
a .connCloseWait .Wait ()
1389
+
1306
1390
return nil
1307
1391
}
1308
1392
0 commit comments