Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 138887d

Browse files
authored
feat: Add workspace agent lifecycle state reporting (#5785)
1 parent dbfeb56 commit 138887d

34 files changed

+1595
-633
lines changed

agent/agent.go

+107-9
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ type Client interface {
7171
WorkspaceAgentMetadata(ctx context.Context) (codersdk.WorkspaceAgentMetadata, error)
7272
ListenWorkspaceAgent(ctx context.Context) (net.Conn, error)
7373
AgentReportStats(ctx context.Context, log slog.Logger, stats func() *codersdk.AgentStats) (io.Closer, error)
74+
PostWorkspaceAgentLifecycle(ctx context.Context, state codersdk.PostWorkspaceAgentLifecycleRequest) error
7475
PostWorkspaceAgentAppHealth(ctx context.Context, req codersdk.PostWorkspaceAppHealthsRequest) error
7576
PostWorkspaceAgentVersion(ctx context.Context, version string) error
7677
}
@@ -101,6 +102,7 @@ func New(options Options) io.Closer {
101102
exchangeToken: options.ExchangeToken,
102103
filesystem: options.Filesystem,
103104
tempDir: options.TempDir,
105+
lifecycleUpdate: make(chan struct{}, 1),
104106
}
105107
a.init(ctx)
106108
return a
@@ -127,6 +129,10 @@ type agent struct {
127129
sessionToken atomic.Pointer[string]
128130
sshServer *ssh.Server
129131

132+
lifecycleUpdate chan struct{}
133+
lifecycleMu sync.Mutex // Protects following.
134+
lifecycleState codersdk.WorkspaceAgentLifecycle
135+
130136
network *tailnet.Conn
131137
}
132138

@@ -135,6 +141,8 @@ type agent struct {
135141
// may be happening, but regardless after the intermittent
136142
// failure, you'll want the agent to reconnect.
137143
func (a *agent) runLoop(ctx context.Context) {
144+
go a.reportLifecycleLoop(ctx)
145+
138146
for retrier := retry.New(100*time.Millisecond, 10*time.Second); retrier.Wait(ctx); {
139147
a.logger.Info(ctx, "running loop")
140148
err := a.run(ctx)
@@ -156,6 +164,58 @@ func (a *agent) runLoop(ctx context.Context) {
156164
}
157165
}
158166

167+
// reportLifecycleLoop posts the agent's lifecycle state to the API each
// time a.lifecycleUpdate is signaled, and returns once ctx is done.
168+
// Only the latest state is reported, intermediate states may be
169+
// lost if the agent can't communicate with the API. A state equal to
// the last successfully reported one is not posted again.
170+
func (a *agent) reportLifecycleLoop(ctx context.Context) {
171+
// Last state the API accepted. It starts as the zero value, so a
// zero-valued state is never posted.
var lastReported codersdk.WorkspaceAgentLifecycle
172+
for {
173+
// Block until setLifecycle signals a change or the context ends.
select {
174+
case <-a.lifecycleUpdate:
175+
case <-ctx.Done():
176+
return
177+
}
178+
179+
// Keep trying until the post succeeds, there is nothing new to send,
// or r.Wait(ctx) returns false.
// NOTE(review): presumably retry.New(1s, 15s) backs off between those
// bounds and Wait stops when ctx is done — confirm against the retry
// package.
for r := retry.New(time.Second, 15*time.Second); r.Wait(ctx); {
180+
// Re-read the state on every attempt so a retry sends the newest
// value rather than the one that failed earlier.
a.lifecycleMu.Lock()
181+
state := a.lifecycleState
182+
a.lifecycleMu.Unlock()
183+
184+
// Already reported; go back to waiting for the next signal.
if state == lastReported {
185+
break
186+
}
187+
188+
a.logger.Debug(ctx, "post lifecycle state", slog.F("state", state))
189+
190+
err := a.client.PostWorkspaceAgentLifecycle(ctx, codersdk.PostWorkspaceAgentLifecycleRequest{
191+
State: state,
192+
})
193+
if err == nil {
194+
lastReported = state
195+
break
196+
}
197+
// A canceled or expired context ends reporting entirely instead of
// retrying.
if xerrors.Is(err, context.Canceled) || xerrors.Is(err, context.DeadlineExceeded) {
198+
return
199+
}
200+
// If we fail to report the state we probably shouldn't exit, log only.
201+
a.logger.Error(ctx, "post state", slog.Error(err))
202+
}
203+
}
204+
}
205+
206+
// setLifecycle stores state as the agent's current lifecycle state and
// signals a.lifecycleUpdate so that reportLifecycleLoop picks it up.
// The signal is sent without blocking: if the channel cannot accept it
// (for example because an earlier signal is still pending), the send is
// skipped. The stored state is still updated, and the reporter reads
// a.lifecycleState when it handles the pending signal.
func (a *agent) setLifecycle(ctx context.Context, state codersdk.WorkspaceAgentLifecycle) {
207+
// lifecycleMu guards a.lifecycleState; it is held for the rest of the
// function, including the log call and the channel send below.
a.lifecycleMu.Lock()
208+
defer a.lifecycleMu.Unlock()
209+
210+
a.logger.Debug(ctx, "set lifecycle state", slog.F("state", state), slog.F("previous", a.lifecycleState))
211+
212+
a.lifecycleState = state
213+
// Non-blocking notify: the default case keeps this call from stalling
// when the channel is full.
select {
214+
case a.lifecycleUpdate <- struct{}{}:
215+
default:
216+
}
217+
}
218+
159219
func (a *agent) run(ctx context.Context) error {
160220
// This allows the agent to refresh its token if necessary.
161221
// For instance identity this is required, since the instance
@@ -180,22 +240,60 @@ func (a *agent) run(ctx context.Context) error {
180240

181241
// The startup script should only execute on the first run!
182242
if oldMetadata == nil {
243+
a.setLifecycle(ctx, codersdk.WorkspaceAgentLifecycleStarting)
244+
245+
// Perform overrides early so that Git auth can work even if users
246+
// connect to a workspace that is not yet ready. We don't run this
247+
// concurrently with the startup script to avoid conflicts between
248+
// them.
249+
if metadata.GitAuthConfigs > 0 {
250+
// If this fails, we should consider surfacing the error in the
251+
// startup log and setting the lifecycle state to be "start_error"
252+
// (after startup script completion), but for now we'll just log it.
253+
err := gitauth.OverrideVSCodeConfigs(a.filesystem)
254+
if err != nil {
255+
a.logger.Warn(ctx, "failed to override vscode git auth configs", slog.Error(err))
256+
}
257+
}
258+
259+
scriptDone := make(chan error, 1)
260+
scriptStart := time.Now()
261+
go func() {
262+
defer close(scriptDone)
263+
scriptDone <- a.runStartupScript(ctx, metadata.StartupScript)
264+
}()
183265
go func() {
184-
err := a.runStartupScript(ctx, metadata.StartupScript)
266+
var timeout <-chan time.Time
267+
// If timeout is zero, an older version of the coder
268+
// provider was used. Otherwise a timeout is always > 0.
269+
if metadata.StartupScriptTimeout > 0 {
270+
t := time.NewTimer(metadata.StartupScriptTimeout)
271+
defer t.Stop()
272+
timeout = t.C
273+
}
274+
275+
var err error
276+
select {
277+
case err = <-scriptDone:
278+
case <-timeout:
279+
a.logger.Warn(ctx, "startup script timed out")
280+
a.setLifecycle(ctx, codersdk.WorkspaceAgentLifecycleStartTimeout)
281+
err = <-scriptDone // The script can still complete after a timeout.
282+
}
185283
if errors.Is(err, context.Canceled) {
186284
return
187285
}
286+
execTime := time.Since(scriptStart)
287+
lifecycleStatus := codersdk.WorkspaceAgentLifecycleReady
188288
if err != nil {
189-
a.logger.Warn(ctx, "agent script failed", slog.Error(err))
289+
a.logger.Warn(ctx, "startup script failed", slog.F("execution_time", execTime), slog.Error(err))
290+
lifecycleStatus = codersdk.WorkspaceAgentLifecycleStartError
291+
} else {
292+
a.logger.Info(ctx, "startup script completed", slog.F("execution_time", execTime))
190293
}
191-
}()
192-
}
193294

194-
if metadata.GitAuthConfigs > 0 {
195-
err = gitauth.OverrideVSCodeConfigs(a.filesystem)
196-
if err != nil {
197-
return xerrors.Errorf("override vscode configuration for git auth: %w", err)
198-
}
295+
a.setLifecycle(ctx, lifecycleStatus)
296+
}()
199297
}
200298

201299
// This automatically closes when the context ends!

0 commit comments

Comments
 (0)