@@ -108,7 +108,7 @@ func New(options Options) Agent {
108
108
}
109
109
}
110
110
if options .ReportMetadataInterval == 0 {
111
- options .ReportMetadataInterval = 1 * time .Minute
111
+ options .ReportMetadataInterval = time .Second
112
112
}
113
113
if options .ServiceBannerRefreshInterval == 0 {
114
114
options .ServiceBannerRefreshInterval = 2 * time .Minute
@@ -242,15 +242,15 @@ func (a *agent) runLoop(ctx context.Context) {
242
242
}
243
243
}
244
244
245
- func (a * agent ) collectMetadata (ctx context.Context , md codersdk.WorkspaceAgentMetadataDescription ) * codersdk.WorkspaceAgentMetadataResult {
245
+ func (a * agent ) collectMetadata (ctx context.Context , md codersdk.WorkspaceAgentMetadataDescription , now time. Time ) * codersdk.WorkspaceAgentMetadataResult {
246
246
var out bytes.Buffer
247
247
result := & codersdk.WorkspaceAgentMetadataResult {
248
248
// CollectedAt is set here for testing purposes and overridden by
249
249
// coderd to the time of server receipt to solve clock skew.
250
250
//
251
251
// In the future, the server may accept the timestamp from the agent
252
252
// if it can guarantee the clocks are synchronized.
253
- CollectedAt : time . Now () ,
253
+ CollectedAt : now ,
254
254
}
255
255
cmdPty , err := a .sshServer .CreateCommand (ctx , md .Script , nil )
256
256
if err != nil {
@@ -298,54 +298,68 @@ type metadataResultAndKey struct {
298
298
}
299
299
300
300
// trySingleflight suppresses overlapping calls that share a key. Unlike a
// conventional singleflight, a caller that arrives while another call with
// the same key is still running does not wait for it: it returns immediately.
//
// The zero value is ready to use. A trySingleflight must not be copied after
// first use because it contains a sync.Mutex.
type trySingleflight struct {
	mu sync.Mutex
	// m is the set of keys that currently have a call in flight. It is
	// allocated lazily by Do and guarded by mu.
	m map[string]struct{}
}

// Do runs fn unless a call with the same key is already in flight, in which
// case it returns immediately without running fn. The key is released once fn
// returns (including when fn panics), so a later Do with the same key runs
// again.
func (t *trySingleflight) Do(key string, fn func()) {
	t.mu.Lock()
	// Reading from a nil map is safe, so the lookup works on the zero value.
	if _, inFlight := t.m[key]; inFlight {
		t.mu.Unlock()
		return
	}
	if t.m == nil {
		// Writing to a nil map panics; allocate on first use so the zero
		// value of trySingleflight is usable.
		t.m = make(map[string]struct{})
	}
	t.m[key] = struct{}{}
	t.mu.Unlock()

	// The lock is not held while fn runs, so calls with other keys are never
	// blocked by a slow fn.
	defer func() {
		t.mu.Lock()
		delete(t.m, key)
		t.mu.Unlock()
	}()

	fn()
}
314
323
315
324
func (a * agent ) reportMetadataLoop (ctx context.Context ) {
316
325
const metadataLimit = 128
317
326
318
327
var (
319
- baseTicker = time .NewTicker (a .reportMetadataInterval )
320
- lastCollectedAts = make (map [string ]time.Time )
321
- metadataResults = make (chan metadataResultAndKey , metadataLimit )
328
+ baseTicker = time .NewTicker (a .reportMetadataInterval )
329
+ lastCollectedAtMu sync.RWMutex
330
+ lastCollectedAts = make (map [string ]time.Time )
331
+ metadataResults = make (chan metadataResultAndKey , metadataLimit )
332
+ logger = a .logger .Named ("metadata" )
322
333
)
323
334
defer baseTicker .Stop ()
324
335
325
336
// We use a custom singleflight that immediately returns if there is already
326
337
// a goroutine running for a given key. This is to prevent a build-up of
327
338
// goroutines waiting on Do when the script takes many multiples of
328
339
// baseInterval to run.
329
- var flight trySingleflight
340
+ flight := trySingleflight {m : map [string ]struct {}{}}
341
+
342
+ postMetadata := func (mr metadataResultAndKey ) {
343
+ err := a .client .PostMetadata (ctx , mr .key , * mr .result )
344
+ if err != nil {
345
+ a .logger .Error (ctx , "agent failed to report metadata" , slog .Error (err ))
346
+ }
347
+ }
330
348
331
349
for {
332
350
select {
333
351
case <- ctx .Done ():
334
352
return
335
353
case mr := <- metadataResults :
336
- lastCollectedAts [mr .key ] = mr .result .CollectedAt
337
- err := a .client .PostMetadata (ctx , mr .key , * mr .result )
338
- if err != nil {
339
- a .logger .Error (ctx , "agent failed to report metadata" , slog .Error (err ))
340
- }
354
+ postMetadata (mr )
355
+ continue
341
356
case <- baseTicker .C :
342
357
}
343
358
344
359
if len (metadataResults ) > 0 {
345
360
// The inner collection loop expects the channel is empty before spinning up
346
361
// all the collection goroutines.
347
- a .logger .Debug (
348
- ctx , "metadata collection backpressured" ,
362
+ logger .Debug (ctx , "metadata collection backpressured" ,
349
363
slog .F ("queue_len" , len (metadataResults )),
350
364
)
351
365
continue
@@ -357,7 +371,7 @@ func (a *agent) reportMetadataLoop(ctx context.Context) {
357
371
}
358
372
359
373
if len (manifest .Metadata ) > metadataLimit {
360
- a . logger .Error (
374
+ logger .Error (
361
375
ctx , "metadata limit exceeded" ,
362
376
slog .F ("limit" , metadataLimit ), slog .F ("got" , len (manifest .Metadata )),
363
377
)
@@ -367,51 +381,79 @@ func (a *agent) reportMetadataLoop(ctx context.Context) {
367
381
// If the manifest changes (e.g. on agent reconnect) we need to
368
382
// purge old cache values to prevent lastCollectedAt from growing
369
383
// boundlessly.
384
+ lastCollectedAtMu .Lock ()
370
385
for key := range lastCollectedAts {
371
386
if slices .IndexFunc (manifest .Metadata , func (md codersdk.WorkspaceAgentMetadataDescription ) bool {
372
387
return md .Key == key
373
388
}) < 0 {
389
+ logger .Debug (ctx , "deleting lastCollected key, missing from manifest" ,
390
+ slog .F ("key" , key ),
391
+ )
374
392
delete (lastCollectedAts , key )
375
393
}
376
394
}
395
+ lastCollectedAtMu .Unlock ()
377
396
378
397
// Spawn a goroutine for each metadata collection, and use a
379
398
// channel to synchronize the results and avoid both messy
380
399
// mutex logic and overloading the API.
381
400
for _ , md := range manifest .Metadata {
382
- collectedAt , ok := lastCollectedAts [md .Key ]
383
- if ok {
384
- // If the interval is zero, we assume the user just wants
385
- // a single collection at startup, not a spinning loop.
386
- if md .Interval == 0 {
387
- continue
388
- }
389
- // The last collected value isn't quite stale yet, so we skip it.
390
- if collectedAt .Add (a .reportMetadataInterval ).After (time .Now ()) {
391
- continue
392
- }
393
- }
394
-
395
401
md := md
396
402
// We send the result to the channel in the goroutine to avoid
397
403
// sending the same result multiple times. So, we don't care about
398
404
// the return values.
399
405
go flight .Do (md .Key , func () {
406
+ ctx := slog .With (ctx , slog .F ("key" , md .Key ))
407
+ lastCollectedAtMu .RLock ()
408
+ collectedAt , ok := lastCollectedAts [md .Key ]
409
+ lastCollectedAtMu .RUnlock ()
410
+ if ok {
411
+ // If the interval is zero, we assume the user just wants
412
+ // a single collection at startup, not a spinning loop.
413
+ if md .Interval == 0 {
414
+ return
415
+ }
416
+ intervalUnit := time .Second
417
+ // reportMetadataInterval is only less than a second in tests,
418
+ // so adjust the interval unit for them.
419
+ if a .reportMetadataInterval < time .Second {
420
+ intervalUnit = 100 * time .Millisecond
421
+ }
422
+ // The last collected value isn't quite stale yet, so we skip it.
423
+ if collectedAt .Add (time .Duration (md .Interval ) * intervalUnit ).After (time .Now ()) {
424
+ return
425
+ }
426
+ }
427
+
400
428
timeout := md .Timeout
401
429
if timeout == 0 {
402
- timeout = md .Interval
430
+ if md .Interval != 0 {
431
+ timeout = md .Interval
432
+ } else if interval := int64 (a .reportMetadataInterval .Seconds ()); interval != 0 {
433
+ // Fallback to the report interval
434
+ timeout = interval * 3
435
+ } else {
436
+ // If the interval is still 0 (possible if the interval
437
+ // is less than a second), default to 5. This was
438
+ // randomly picked.
439
+ timeout = 5
440
+ }
403
441
}
404
- ctx , cancel := context .WithTimeout (ctx ,
405
- time .Duration (timeout )* time .Second ,
406
- )
442
+ ctxTimeout := time .Duration (timeout ) * time .Second
443
+ ctx , cancel := context .WithTimeout (ctx , ctxTimeout )
407
444
defer cancel ()
408
445
446
+ now := time .Now ()
409
447
select {
410
448
case <- ctx .Done ():
449
+ logger .Warn (ctx , "metadata collection timed out" , slog .F ("timeout" , ctxTimeout ))
411
450
case metadataResults <- metadataResultAndKey {
412
451
key : md .Key ,
413
- result : a .collectMetadata (ctx , md ),
452
+ result : a .collectMetadata (ctx , md , now ),
414
453
}:
454
+ lastCollectedAtMu .Lock ()
455
+ lastCollectedAts [md .Key ] = now
456
+ lastCollectedAtMu .Unlock ()
415
457
}
416
458
})
417
459
}
0 commit comments