@@ -71,6 +71,7 @@ type Client interface {
71
71
WorkspaceAgentMetadata (ctx context.Context ) (codersdk.WorkspaceAgentMetadata , error )
72
72
ListenWorkspaceAgent (ctx context.Context ) (net.Conn , error )
73
73
AgentReportStats (ctx context.Context , log slog.Logger , stats func () * codersdk.AgentStats ) (io.Closer , error )
74
+ PostWorkspaceAgentLifecycle (ctx context.Context , state codersdk.PostWorkspaceAgentLifecycleRequest ) error
74
75
PostWorkspaceAgentAppHealth (ctx context.Context , req codersdk.PostWorkspaceAppHealthsRequest ) error
75
76
PostWorkspaceAgentVersion (ctx context.Context , version string ) error
76
77
}
@@ -101,6 +102,7 @@ func New(options Options) io.Closer {
101
102
exchangeToken : options .ExchangeToken ,
102
103
filesystem : options .Filesystem ,
103
104
tempDir : options .TempDir ,
105
+ lifecycleUpdate : make (chan struct {}, 1 ),
104
106
}
105
107
a .init (ctx )
106
108
return a
@@ -127,6 +129,10 @@ type agent struct {
127
129
sessionToken atomic.Pointer [string ]
128
130
sshServer * ssh.Server
129
131
132
+ lifecycleUpdate chan struct {}
133
+ lifecycleMu sync.Mutex // Protects following.
134
+ lifecycleState codersdk.WorkspaceAgentLifecycle
135
+
130
136
network * tailnet.Conn
131
137
}
132
138
@@ -135,6 +141,8 @@ type agent struct {
135
141
// may be happening, but regardless after the intermittent
136
142
// failure, you'll want the agent to reconnect.
137
143
func (a * agent ) runLoop (ctx context.Context ) {
144
+ go a .reportLifecycleLoop (ctx )
145
+
138
146
for retrier := retry .New (100 * time .Millisecond , 10 * time .Second ); retrier .Wait (ctx ); {
139
147
a .logger .Info (ctx , "running loop" )
140
148
err := a .run (ctx )
@@ -156,6 +164,58 @@ func (a *agent) runLoop(ctx context.Context) {
156
164
}
157
165
}
158
166
167
+ // reportLifecycleLoop reports the current lifecycle state once.
168
+ // Only the latest state is reported, intermediate states may be
169
+ // lost if the agent can't communicate with the API.
170
+ func (a * agent ) reportLifecycleLoop (ctx context.Context ) {
171
+ var lastReported codersdk.WorkspaceAgentLifecycle
172
+ for {
173
+ select {
174
+ case <- a .lifecycleUpdate :
175
+ case <- ctx .Done ():
176
+ return
177
+ }
178
+
179
+ for r := retry .New (time .Second , 15 * time .Second ); r .Wait (ctx ); {
180
+ a .lifecycleMu .Lock ()
181
+ state := a .lifecycleState
182
+ a .lifecycleMu .Unlock ()
183
+
184
+ if state == lastReported {
185
+ break
186
+ }
187
+
188
+ a .logger .Debug (ctx , "post lifecycle state" , slog .F ("state" , state ))
189
+
190
+ err := a .client .PostWorkspaceAgentLifecycle (ctx , codersdk.PostWorkspaceAgentLifecycleRequest {
191
+ State : state ,
192
+ })
193
+ if err == nil {
194
+ lastReported = state
195
+ break
196
+ }
197
+ if xerrors .Is (err , context .Canceled ) || xerrors .Is (err , context .DeadlineExceeded ) {
198
+ return
199
+ }
200
+ // If we fail to report the state we probably shouldn't exit, log only.
201
+ a .logger .Error (ctx , "post state" , slog .Error (err ))
202
+ }
203
+ }
204
+ }
205
+
206
+ func (a * agent ) setLifecycle (ctx context.Context , state codersdk.WorkspaceAgentLifecycle ) {
207
+ a .lifecycleMu .Lock ()
208
+ defer a .lifecycleMu .Unlock ()
209
+
210
+ a .logger .Debug (ctx , "set lifecycle state" , slog .F ("state" , state ), slog .F ("previous" , a .lifecycleState ))
211
+
212
+ a .lifecycleState = state
213
+ select {
214
+ case a .lifecycleUpdate <- struct {}{}:
215
+ default :
216
+ }
217
+ }
218
+
159
219
func (a * agent ) run (ctx context.Context ) error {
160
220
// This allows the agent to refresh its token if necessary.
161
221
// For instance identity this is required, since the instance
@@ -180,22 +240,60 @@ func (a *agent) run(ctx context.Context) error {
180
240
181
241
// The startup script should only execute on the first run!
182
242
if oldMetadata == nil {
243
+ a .setLifecycle (ctx , codersdk .WorkspaceAgentLifecycleStarting )
244
+
245
+ // Perform overrides early so that Git auth can work even if users
246
+ // connect to a workspace that is not yet ready. We don't run this
247
+ // concurrently with the startup script to avoid conflicts between
248
+ // them.
249
+ if metadata .GitAuthConfigs > 0 {
250
+ // If this fails, we should consider surfacing the error in the
251
+ // startup log and setting the lifecycle state to be "start_error"
252
+ // (after startup script completion), but for now we'll just log it.
253
+ err := gitauth .OverrideVSCodeConfigs (a .filesystem )
254
+ if err != nil {
255
+ a .logger .Warn (ctx , "failed to override vscode git auth configs" , slog .Error (err ))
256
+ }
257
+ }
258
+
259
+ scriptDone := make (chan error , 1 )
260
+ scriptStart := time .Now ()
261
+ go func () {
262
+ defer close (scriptDone )
263
+ scriptDone <- a .runStartupScript (ctx , metadata .StartupScript )
264
+ }()
183
265
go func () {
184
- err := a .runStartupScript (ctx , metadata .StartupScript )
266
+ var timeout <- chan time.Time
267
+ // If timeout is zero, an older version of the coder
268
+ // provider was used. Otherwise a timeout is always > 0.
269
+ if metadata .StartupScriptTimeout > 0 {
270
+ t := time .NewTimer (metadata .StartupScriptTimeout )
271
+ defer t .Stop ()
272
+ timeout = t .C
273
+ }
274
+
275
+ var err error
276
+ select {
277
+ case err = <- scriptDone :
278
+ case <- timeout :
279
+ a .logger .Warn (ctx , "startup script timed out" )
280
+ a .setLifecycle (ctx , codersdk .WorkspaceAgentLifecycleStartTimeout )
281
+ err = <- scriptDone // The script can still complete after a timeout.
282
+ }
185
283
if errors .Is (err , context .Canceled ) {
186
284
return
187
285
}
286
+ execTime := time .Since (scriptStart )
287
+ lifecycleStatus := codersdk .WorkspaceAgentLifecycleReady
188
288
if err != nil {
189
- a .logger .Warn (ctx , "agent script failed" , slog .Error (err ))
289
+ a .logger .Warn (ctx , "startup script failed" , slog .F ("execution_time" , execTime ), slog .Error (err ))
290
+ lifecycleStatus = codersdk .WorkspaceAgentLifecycleStartError
291
+ } else {
292
+ a .logger .Info (ctx , "startup script completed" , slog .F ("execution_time" , execTime ))
190
293
}
191
- }()
192
- }
193
294
194
- if metadata .GitAuthConfigs > 0 {
195
- err = gitauth .OverrideVSCodeConfigs (a .filesystem )
196
- if err != nil {
197
- return xerrors .Errorf ("override vscode configuration for git auth: %w" , err )
198
- }
295
+ a .setLifecycle (ctx , lifecycleStatus )
296
+ }()
199
297
}
200
298
201
299
// This automatically closes when the context ends!
0 commit comments