Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 22e3ff9

Browse files
mafredri and mtojek authored
feat(agent): Add shutdown lifecycle states and shutdown_script support (#6139)
* feat(api): Add agent shutdown lifecycle states
* feat(agent): Add shutdown_script support
* feat(agent): Add shutdown_script timeout
* feat(site): Support new agent lifecycle states

---

Co-authored-by: Marcin Tojek <[email protected]>
1 parent 02100c6 commit 22e3ff9

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

41 files changed

+1439
-635
lines changed

agent/agent.go

Lines changed: 108 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -121,6 +121,7 @@ func New(options Options) io.Closer {
121121
logDir: options.LogDir,
122122
tempDir: options.TempDir,
123123
lifecycleUpdate: make(chan struct{}, 1),
124+
lifecycleReported: make(chan codersdk.WorkspaceAgentLifecycle, 1),
124125
connStatsChan: make(chan *agentsdk.Stats, 1),
125126
}
126127
a.init(ctx)
@@ -149,9 +150,10 @@ type agent struct {
149150
sessionToken atomic.Pointer[string]
150151
sshServer *ssh.Server
151152

152-
lifecycleUpdate chan struct{}
153-
lifecycleMu sync.Mutex // Protects following.
154-
lifecycleState codersdk.WorkspaceAgentLifecycle
153+
lifecycleUpdate chan struct{}
154+
lifecycleReported chan codersdk.WorkspaceAgentLifecycle
155+
lifecycleMu sync.RWMutex // Protects following.
156+
lifecycleState codersdk.WorkspaceAgentLifecycle
155157

156158
network *tailnet.Conn
157159
connStatsChan chan *agentsdk.Stats
@@ -207,9 +209,9 @@ func (a *agent) reportLifecycleLoop(ctx context.Context) {
207209
}
208210

209211
for r := retry.New(time.Second, 15*time.Second); r.Wait(ctx); {
210-
a.lifecycleMu.Lock()
212+
a.lifecycleMu.RLock()
211213
state := a.lifecycleState
212-
a.lifecycleMu.Unlock()
214+
a.lifecycleMu.RUnlock()
213215

214216
if state == lastReported {
215217
break
@@ -222,6 +224,11 @@ func (a *agent) reportLifecycleLoop(ctx context.Context) {
222224
})
223225
if err == nil {
224226
lastReported = state
227+
select {
228+
case a.lifecycleReported <- state:
229+
case <-a.lifecycleReported:
230+
a.lifecycleReported <- state
231+
}
225232
break
226233
}
227234
if xerrors.Is(err, context.Canceled) || xerrors.Is(err, context.DeadlineExceeded) {
@@ -233,13 +240,20 @@ func (a *agent) reportLifecycleLoop(ctx context.Context) {
233240
}
234241
}
235242

243+
// setLifecycle sets the lifecycle state and notifies the lifecycle loop.
244+
// The state is only updated if it's a valid state transition.
236245
func (a *agent) setLifecycle(ctx context.Context, state codersdk.WorkspaceAgentLifecycle) {
237246
a.lifecycleMu.Lock()
238-
defer a.lifecycleMu.Unlock()
239-
240-
a.logger.Debug(ctx, "set lifecycle state", slog.F("state", state), slog.F("previous", a.lifecycleState))
241-
247+
lastState := a.lifecycleState
248+
if slices.Index(codersdk.WorkspaceAgentLifecycleOrder, lastState) > slices.Index(codersdk.WorkspaceAgentLifecycleOrder, state) {
249+
a.logger.Warn(ctx, "attempted to set lifecycle state to a previous state", slog.F("last", lastState), slog.F("state", state))
250+
a.lifecycleMu.Unlock()
251+
return
252+
}
242253
a.lifecycleState = state
254+
a.logger.Debug(ctx, "set lifecycle state", slog.F("state", state), slog.F("last", lastState))
255+
a.lifecycleMu.Unlock()
256+
243257
select {
244258
case a.lifecycleUpdate <- struct{}{}:
245259
default:
@@ -299,9 +313,10 @@ func (a *agent) run(ctx context.Context) error {
299313
}
300314
}
301315

316+
lifecycleState := codersdk.WorkspaceAgentLifecycleReady
302317
scriptDone := make(chan error, 1)
303318
scriptStart := time.Now()
304-
err := a.trackConnGoroutine(func() {
319+
err = a.trackConnGoroutine(func() {
305320
defer close(scriptDone)
306321
scriptDone <- a.runStartupScript(ctx, metadata.StartupScript)
307322
})
@@ -329,16 +344,17 @@ func (a *agent) run(ctx context.Context) error {
329344
if errors.Is(err, context.Canceled) {
330345
return
331346
}
332-
execTime := time.Since(scriptStart)
333-
lifecycleStatus := codersdk.WorkspaceAgentLifecycleReady
334-
if err != nil {
335-
a.logger.Warn(ctx, "startup script failed", slog.F("execution_time", execTime), slog.Error(err))
336-
lifecycleStatus = codersdk.WorkspaceAgentLifecycleStartError
337-
} else {
338-
a.logger.Info(ctx, "startup script completed", slog.F("execution_time", execTime))
347+
// Only log if there was a startup script.
348+
if metadata.StartupScript != "" {
349+
execTime := time.Since(scriptStart)
350+
if err != nil {
351+
a.logger.Warn(ctx, "startup script failed", slog.F("execution_time", execTime), slog.Error(err))
352+
lifecycleState = codersdk.WorkspaceAgentLifecycleStartError
353+
} else {
354+
a.logger.Info(ctx, "startup script completed", slog.F("execution_time", execTime))
355+
}
339356
}
340-
341-
a.setLifecycle(ctx, lifecycleStatus)
357+
a.setLifecycle(ctx, lifecycleState)
342358
}()
343359
}
344360

@@ -606,14 +622,22 @@ func (a *agent) runCoordinator(ctx context.Context, network *tailnet.Conn) error
606622
}
607623

608624
func (a *agent) runStartupScript(ctx context.Context, script string) error {
625+
return a.runScript(ctx, "startup", script)
626+
}
627+
628+
func (a *agent) runShutdownScript(ctx context.Context, script string) error {
629+
return a.runScript(ctx, "shutdown", script)
630+
}
631+
632+
func (a *agent) runScript(ctx context.Context, lifecycle, script string) error {
609633
if script == "" {
610634
return nil
611635
}
612636

613-
a.logger.Info(ctx, "running startup script", slog.F("script", script))
614-
writer, err := a.filesystem.OpenFile(filepath.Join(a.logDir, "coder-startup-script.log"), os.O_CREATE|os.O_RDWR, 0o600)
637+
a.logger.Info(ctx, "running script", slog.F("lifecycle", lifecycle), slog.F("script", script))
638+
writer, err := a.filesystem.OpenFile(filepath.Join(a.logDir, fmt.Sprintf("coder-%s-script.log", lifecycle)), os.O_CREATE|os.O_RDWR, 0o600)
615639
if err != nil {
616-
return xerrors.Errorf("open startup script log file: %w", err)
640+
return xerrors.Errorf("open %s script log file: %w", lifecycle, err)
617641
}
618642
defer func() {
619643
_ = writer.Close()
@@ -774,7 +798,7 @@ func (a *agent) createCommand(ctx context.Context, rawCommand string, env []stri
774798

775799
rawMetadata := a.metadata.Load()
776800
if rawMetadata == nil {
777-
return nil, xerrors.Errorf("no metadata was provided: %w", err)
801+
return nil, xerrors.Errorf("no metadata was provided")
778802
}
779803
metadata, valid := rawMetadata.(agentsdk.Metadata)
780804
if !valid {
@@ -1290,13 +1314,73 @@ func (a *agent) Close() error {
12901314
if a.isClosed() {
12911315
return nil
12921316
}
1317+
1318+
ctx := context.Background()
1319+
a.setLifecycle(ctx, codersdk.WorkspaceAgentLifecycleShuttingDown)
1320+
1321+
lifecycleState := codersdk.WorkspaceAgentLifecycleOff
1322+
if metadata, ok := a.metadata.Load().(agentsdk.Metadata); ok && metadata.ShutdownScript != "" {
1323+
scriptDone := make(chan error, 1)
1324+
scriptStart := time.Now()
1325+
go func() {
1326+
defer close(scriptDone)
1327+
scriptDone <- a.runShutdownScript(ctx, metadata.ShutdownScript)
1328+
}()
1329+
1330+
var timeout <-chan time.Time
1331+
// If timeout is zero, an older version of the coder
1332+
// provider was used. Otherwise a timeout is always > 0.
1333+
if metadata.ShutdownScriptTimeout > 0 {
1334+
t := time.NewTimer(metadata.ShutdownScriptTimeout)
1335+
defer t.Stop()
1336+
timeout = t.C
1337+
}
1338+
1339+
var err error
1340+
select {
1341+
case err = <-scriptDone:
1342+
case <-timeout:
1343+
a.logger.Warn(ctx, "shutdown script timed out")
1344+
a.setLifecycle(ctx, codersdk.WorkspaceAgentLifecycleShutdownTimeout)
1345+
err = <-scriptDone // The script can still complete after a timeout.
1346+
}
1347+
execTime := time.Since(scriptStart)
1348+
if err != nil {
1349+
a.logger.Warn(ctx, "shutdown script failed", slog.F("execution_time", execTime), slog.Error(err))
1350+
lifecycleState = codersdk.WorkspaceAgentLifecycleShutdownError
1351+
} else {
1352+
a.logger.Info(ctx, "shutdown script completed", slog.F("execution_time", execTime))
1353+
}
1354+
}
1355+
1356+
// Set final state and wait for it to be reported because context
1357+
// cancellation will stop the report loop.
1358+
a.setLifecycle(ctx, lifecycleState)
1359+
1360+
// Wait for the lifecycle to be reported, but don't wait forever so
1361+
// that we don't break user expectations.
1362+
ctx, cancel := context.WithTimeout(ctx, 5*time.Second)
1363+
defer cancel()
1364+
lifecycleWaitLoop:
1365+
for {
1366+
select {
1367+
case <-ctx.Done():
1368+
break lifecycleWaitLoop
1369+
case s := <-a.lifecycleReported:
1370+
if s == lifecycleState {
1371+
break lifecycleWaitLoop
1372+
}
1373+
}
1374+
}
1375+
12931376
close(a.closed)
12941377
a.closeCancel()
1378+
_ = a.sshServer.Close()
12951379
if a.network != nil {
12961380
_ = a.network.Close()
12971381
}
1298-
_ = a.sshServer.Close()
12991382
a.connCloseWait.Wait()
1383+
13001384
return nil
13011385
}
13021386

0 commit comments

Comments
 (0)