Commit 2eab1b5

fix: Allow provisionerd to cleanup acquired job (#159)
If a job was acquired from the database and provisionerd was then killed, the job would be left idle even though it was technically still in-progress.
1 parent 94f71fe commit 2eab1b5
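For orientation, the sketch below condenses the close sequence this commit moves to: mark the daemon closed, cancel any acquired job with a reason, then cancel the connection context, so a job acquired just before shutdown is cleaned up instead of orphaned. It is a self-contained illustration; the types and helper bodies are simplified stand-ins, not the actual provisionerd code.

package main

import (
	"context"
	"fmt"
	"sync"
)

// daemon is an illustrative stand-in for provisionerDaemon: field and method
// names mirror the diff below, but the bodies are heavily simplified.
type daemon struct {
	closeMutex  sync.Mutex
	closeCancel context.CancelFunc
	closed      chan struct{}
	closeError  error

	jobMutex  sync.Mutex
	jobID     string
	jobCancel context.CancelFunc
}

func (d *daemon) isClosed() bool {
	select {
	case <-d.closed:
		return true
	default:
		return false
	}
}

// closeWithError shows the ordering this commit moves to: mark the daemon
// closed first (so concurrent acquires bail out), then cancel the active job
// with a reason, and only then cancel the connection context.
func (d *daemon) closeWithError(err error) error {
	d.closeMutex.Lock()
	defer d.closeMutex.Unlock()
	if d.isClosed() {
		return d.closeError
	}
	d.closeError = err
	close(d.closed)

	errMsg := "provisioner daemon was shutdown gracefully"
	if err != nil {
		errMsg = err.Error()
	}
	d.cancelActiveJobf(errMsg)
	d.closeCancel()
	return err
}

// cancelActiveJobf is a printf-style helper; the real method also reports the
// failure back to coderd, which is omitted here.
func (d *daemon) cancelActiveJobf(format string, args ...interface{}) {
	d.jobMutex.Lock()
	defer d.jobMutex.Unlock()
	if d.jobID == "" {
		return // nothing acquired, nothing to clean up
	}
	fmt.Printf("canceling active job %s: %s\n", d.jobID, fmt.Sprintf(format, args...))
	d.jobCancel()
	d.jobID = ""
}

func main() {
	ctx, cancel := context.WithCancel(context.Background())
	jobCtx, jobCancel := context.WithCancel(ctx)
	d := &daemon{
		closeCancel: cancel,
		closed:      make(chan struct{}),
		jobID:       "example-job",
		jobCancel:   jobCancel,
	}
	_ = d.closeWithError(nil)
	<-jobCtx.Done() // the acquired job's context is canceled, not orphaned
	fmt.Println("job context done:", jobCtx.Err())
}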

File tree

.vscode/settings.json
provisionerd/provisionerd.go

2 files changed: +54 -65 lines changed

.vscode/settings.json

Lines changed: 1 addition & 0 deletions
@@ -34,6 +34,7 @@
     "goleak",
     "hashicorp",
     "httpmw",
+    "Jobf",
     "moby",
     "nhooyr",
     "nolint",

provisionerd/provisionerd.go

Lines changed: 53 additions & 65 deletions
@@ -51,9 +51,8 @@ func New(clientDialer Dialer, opts *Options) io.Closer {
 		clientDialer: clientDialer,
 		opts:         opts,
 
-		closeContext: ctx,
-		closeCancel:  ctxCancel,
-		closed:       make(chan struct{}),
+		closeCancel: ctxCancel,
+		closed:      make(chan struct{}),
 
 		jobRunning: make(chan struct{}),
 	}
@@ -71,23 +70,21 @@ type provisionerDaemon struct {
 	client       proto.DRPCProvisionerDaemonClient
 	updateStream proto.DRPCProvisionerDaemon_UpdateJobClient
 
-	closeContext context.Context
-	closeCancel  context.CancelFunc
-	closed       chan struct{}
-	closeMutex   sync.Mutex
-	closeError   error
+	// Locked when closing the daemon.
+	closeMutex  sync.Mutex
+	closeCancel context.CancelFunc
+	closed      chan struct{}
+	closeError  error
 
-	jobID      string
+	// Locked when acquiring or canceling a job.
 	jobMutex   sync.Mutex
+	jobID      string
 	jobRunning chan struct{}
 	jobCancel  context.CancelFunc
 }
 
 // Connect establishes a connection to coderd.
 func (p *provisionerDaemon) connect(ctx context.Context) {
-	p.jobMutex.Lock()
-	defer p.jobMutex.Unlock()
-
 	var err error
 	// An exponential back-off occurs when the connection is failing to dial.
 	// This is to prevent server spam in case of a coderd outage.
@@ -102,6 +99,9 @@ func (p *provisionerDaemon) connect(ctx context.Context) {
 		}
 		p.updateStream, err = p.client.UpdateJob(ctx)
 		if err != nil {
+			if errors.Is(err, context.Canceled) {
+				return
+			}
 			p.opts.Logger.Warn(context.Background(), "create update job stream", slog.Error(err))
 			continue
 		}
@@ -126,12 +126,6 @@ func (p *provisionerDaemon) connect(ctx context.Context) {
 			// has been interrupted. This works well, because logs need
 			// to buffer if a job is running in the background.
 			p.opts.Logger.Debug(context.Background(), "update stream ended", slog.Error(p.updateStream.Context().Err()))
-			// Make sure we're not closing here!
-			p.closeMutex.Lock()
-			defer p.closeMutex.Unlock()
-			if p.isClosed() {
-				return
-			}
 			p.connect(ctx)
 		}
 	}()
@@ -168,6 +162,9 @@ func (p *provisionerDaemon) isRunningJob() bool {
 func (p *provisionerDaemon) acquireJob(ctx context.Context) {
 	p.jobMutex.Lock()
 	defer p.jobMutex.Unlock()
+	if p.isClosed() {
+		return
+	}
 	if p.isRunningJob() {
 		p.opts.Logger.Debug(context.Background(), "skipping acquire; job is already running")
 		return
@@ -184,15 +181,10 @@ func (p *provisionerDaemon) acquireJob(ctx context.Context) {
 		p.opts.Logger.Warn(context.Background(), "acquire job", slog.Error(err))
 		return
 	}
-	if p.isClosed() {
-		return
-	}
 	if job.JobId == "" {
 		p.opts.Logger.Debug(context.Background(), "no jobs available")
 		return
 	}
-	p.closeMutex.Lock()
-	defer p.closeMutex.Unlock()
 	ctx, p.jobCancel = context.WithCancel(ctx)
 	p.jobRunning = make(chan struct{})
 	p.jobID = job.JobId
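The two hunks above are the acquire-side half of the fix: acquireJob now checks the closed flag up front, while holding jobMutex, rather than re-checking it (and taking closeMutex) after a job has already been pulled from the database. A minimal, runnable sketch of that pattern follows; the types are illustrative, not the provisionerd ones.

package main

import (
	"fmt"
	"sync"
)

// Illustrative only: shows why acquireJob checks the closed channel while
// holding jobMutex. Shutdown closes the channel and then cancels any active
// job under the same jobMutex, so an acquire either sees "closed" and refuses
// the job, or finishes first and is cleaned up by the cancel that follows.
type daemon struct {
	closed chan struct{}

	jobMutex sync.Mutex
	jobID    string
}

func (d *daemon) isClosed() bool {
	select {
	case <-d.closed:
		return true
	default:
		return false
	}
}

func (d *daemon) acquireJob(id string) bool {
	d.jobMutex.Lock()
	defer d.jobMutex.Unlock()
	if d.isClosed() {
		return false // refuse new work once shutdown has begun
	}
	d.jobID = id
	return true
}

func (d *daemon) close() {
	close(d.closed) // future acquires will refuse work

	d.jobMutex.Lock()
	defer d.jobMutex.Unlock()
	if d.jobID != "" {
		fmt.Println("canceling acquired job:", d.jobID) // cleaned up, not orphaned
		d.jobID = ""
	}
}

func main() {
	d := &daemon{closed: make(chan struct{})}
	fmt.Println("acquired before close:", d.acquireJob("job-1")) // true
	d.close()                                                    // cancels job-1
	fmt.Println("acquired after close:", d.acquireJob("job-2"))  // false
}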
@@ -222,31 +214,27 @@ func (p *provisionerDaemon) runJob(ctx context.Context, job *proto.AcquiredJob)
 				JobId: job.JobId,
 			})
 			if err != nil {
-				go p.cancelActiveJob(fmt.Sprintf("send periodic update: %s", err))
+				go p.cancelActiveJobf("send periodic update: %s", err)
 				return
 			}
 		}
 	}()
 	defer func() {
 		// Cleanup the work directory after execution.
 		err := os.RemoveAll(p.opts.WorkDirectory)
-		if err != nil {
-			go p.cancelActiveJob(fmt.Sprintf("remove all from %q directory: %s", p.opts.WorkDirectory, err))
-			return
-		}
-		p.opts.Logger.Debug(ctx, "cleaned up work directory")
+		p.opts.Logger.Debug(ctx, "cleaned up work directory", slog.Error(err))
 		close(p.jobRunning)
 	}()
 	// It's safe to cast this ProvisionerType. This data is coming directly from coderd.
 	provisioner, hasProvisioner := p.opts.Provisioners[job.Provisioner]
 	if !hasProvisioner {
-		go p.cancelActiveJob(fmt.Sprintf("provisioner %q not registered", job.Provisioner))
+		go p.cancelActiveJobf("provisioner %q not registered", job.Provisioner)
 		return
 	}
 
 	err := os.MkdirAll(p.opts.WorkDirectory, 0700)
 	if err != nil {
-		go p.cancelActiveJob(fmt.Sprintf("create work directory %q: %s", p.opts.WorkDirectory, err))
+		go p.cancelActiveJobf("create work directory %q: %s", p.opts.WorkDirectory, err)
 		return
 	}
 
@@ -258,13 +246,13 @@ func (p *provisionerDaemon) runJob(ctx context.Context, job *proto.AcquiredJob)
 			break
 		}
 		if err != nil {
-			go p.cancelActiveJob(fmt.Sprintf("read project source archive: %s", err))
+			go p.cancelActiveJobf("read project source archive: %s", err)
 			return
 		}
 		// #nosec
 		path := filepath.Join(p.opts.WorkDirectory, header.Name)
 		if !strings.HasPrefix(path, filepath.Clean(p.opts.WorkDirectory)) {
-			go p.cancelActiveJob("tar attempts to target relative upper directory")
+			go p.cancelActiveJobf("tar attempts to target relative upper directory")
 			return
 		}
 		mode := header.FileInfo().Mode()
@@ -275,14 +263,14 @@ func (p *provisionerDaemon) runJob(ctx context.Context, job *proto.AcquiredJob)
 		case tar.TypeDir:
 			err = os.MkdirAll(path, mode)
 			if err != nil {
-				go p.cancelActiveJob(fmt.Sprintf("mkdir %q: %s", path, err))
+				go p.cancelActiveJobf("mkdir %q: %s", path, err)
 				return
 			}
 			p.opts.Logger.Debug(context.Background(), "extracted directory", slog.F("path", path))
 		case tar.TypeReg:
 			file, err := os.OpenFile(path, os.O_CREATE|os.O_RDWR, mode)
 			if err != nil {
-				go p.cancelActiveJob(fmt.Sprintf("create file %q (mode %s): %s", path, mode, err))
+				go p.cancelActiveJobf("create file %q (mode %s): %s", path, mode, err)
 				return
 			}
 			// Max file size of 10MB.
@@ -291,12 +279,12 @@ func (p *provisionerDaemon) runJob(ctx context.Context, job *proto.AcquiredJob)
 				err = nil
 			}
 			if err != nil {
-				go p.cancelActiveJob(fmt.Sprintf("copy file %q: %s", path, err))
+				go p.cancelActiveJobf("copy file %q: %s", path, err)
 				return
 			}
 			err = file.Close()
 			if err != nil {
-				go p.cancelActiveJob(fmt.Sprintf("close file %q: %s", path, err))
+				go p.cancelActiveJobf("close file %q: %s", path, err)
 				return
 			}
 			p.opts.Logger.Debug(context.Background(), "extracted file",
@@ -323,7 +311,7 @@ func (p *provisionerDaemon) runJob(ctx context.Context, job *proto.AcquiredJob)
 
 		p.runWorkspaceProvision(ctx, provisioner, job)
 	default:
-		go p.cancelActiveJob(fmt.Sprintf("unknown job type %q; ensure your provisioner daemon is up-to-date", reflect.TypeOf(job.Type).String()))
+		go p.cancelActiveJobf("unknown job type %q; ensure your provisioner daemon is up-to-date", reflect.TypeOf(job.Type).String())
 		return
 	}
 
@@ -335,14 +323,14 @@ func (p *provisionerDaemon) runProjectImport(ctx context.Context, provisioner sd
 		Directory: p.opts.WorkDirectory,
 	})
 	if err != nil {
-		go p.cancelActiveJob(fmt.Sprintf("parse source: %s", err))
+		go p.cancelActiveJobf("parse source: %s", err)
 		return
 	}
 	defer stream.Close()
 	for {
 		msg, err := stream.Recv()
 		if err != nil {
-			go p.cancelActiveJob(fmt.Sprintf("recv parse source: %s", err))
+			go p.cancelActiveJobf("recv parse source: %s", err)
 			return
 		}
 		switch msgType := msg.Type.(type) {
@@ -363,7 +351,7 @@ func (p *provisionerDaemon) runProjectImport(ctx context.Context, provisioner sd
 				}},
 			})
 			if err != nil {
-				go p.cancelActiveJob(fmt.Sprintf("update job: %s", err))
+				go p.cancelActiveJobf("update job: %s", err)
 				return
 			}
 		case *sdkproto.Parse_Response_Complete:
case *sdkproto.Parse_Response_Complete:
@@ -379,14 +367,14 @@ func (p *provisionerDaemon) runProjectImport(ctx context.Context, provisioner sd
379367
},
380368
})
381369
if err != nil {
382-
go p.cancelActiveJob(fmt.Sprintf("complete job: %s", err))
370+
go p.cancelActiveJobf("complete job: %s", err)
383371
return
384372
}
385373
// Return so we stop looping!
386374
return
387375
default:
388-
go p.cancelActiveJob(fmt.Sprintf("invalid message type %q received from provisioner",
389-
reflect.TypeOf(msg.Type).String()))
376+
go p.cancelActiveJobf("invalid message type %q received from provisioner",
377+
reflect.TypeOf(msg.Type).String())
390378
return
391379
}
392380
}
@@ -399,15 +387,15 @@ func (p *provisionerDaemon) runWorkspaceProvision(ctx context.Context, provision
 		State: job.GetWorkspaceProvision().State,
 	})
 	if err != nil {
-		go p.cancelActiveJob(fmt.Sprintf("provision: %s", err))
+		go p.cancelActiveJobf("provision: %s", err)
 		return
 	}
 	defer stream.Close()
 
 	for {
 		msg, err := stream.Recv()
 		if err != nil {
-			go p.cancelActiveJob(fmt.Sprintf("recv workspace provision: %s", err))
+			go p.cancelActiveJobf("recv workspace provision: %s", err)
 			return
 		}
 		switch msgType := msg.Type.(type) {
@@ -428,7 +416,7 @@ func (p *provisionerDaemon) runWorkspaceProvision(ctx context.Context, provision
 				}},
 			})
 			if err != nil {
-				go p.cancelActiveJob(fmt.Sprintf("send job update: %s", err))
+				go p.cancelActiveJobf("send job update: %s", err)
 				return
 			}
 		case *sdkproto.Provision_Response_Complete:
@@ -450,26 +438,28 @@ func (p *provisionerDaemon) runWorkspaceProvision(ctx context.Context, provision
 				},
 			})
 			if err != nil {
-				go p.cancelActiveJob(fmt.Sprintf("complete job: %s", err))
+				go p.cancelActiveJobf("complete job: %s", err)
 				return
 			}
 			// Return so we stop looping!
 			return
 		default:
-			go p.cancelActiveJob(fmt.Sprintf("invalid message type %q received from provisioner",
-				reflect.TypeOf(msg.Type).String()))
+			go p.cancelActiveJobf("invalid message type %q received from provisioner",
+				reflect.TypeOf(msg.Type).String())
 			return
 		}
 	}
 }
 
-func (p *provisionerDaemon) cancelActiveJob(errMsg string) {
+func (p *provisionerDaemon) cancelActiveJobf(format string, args ...interface{}) {
 	p.jobMutex.Lock()
 	defer p.jobMutex.Unlock()
-	if p.isClosed() {
-		return
-	}
+	errMsg := fmt.Sprintf(format, args...)
 	if !p.isRunningJob() {
+		if p.isClosed() {
+			// We don't want to log if we're already closed!
+			return
+		}
 		p.opts.Logger.Warn(context.Background(), "skipping job cancel; none running", slog.F("error_message", errMsg))
 		return
 	}
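The hunk above also renames cancelActiveJob to cancelActiveJobf, moving the fmt.Sprintf call inside the helper so call sites pass a format string and arguments directly (hence the "Jobf" entry added to .vscode/settings.json). Below is a tiny, generic illustration of that printf-style helper shape, not the provisionerd implementation.

package main

import (
	"errors"
	"fmt"
)

// cancelActiveJobf-style helper: the format call happens inside the helper, so
// callers pass a format string plus arguments instead of wrapping every
// message in fmt.Sprintf. Generic names are used here for illustration.
func cancelf(format string, args ...interface{}) {
	msg := fmt.Sprintf(format, args...)
	fmt.Println("canceling active job:", msg)
}

func main() {
	err := errors.New("connection reset by peer")
	// Old call shape: cancel(fmt.Sprintf("recv workspace provision: %s", err))
	// New call shape:
	cancelf("recv workspace provision: %s", err)
}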
@@ -512,22 +502,20 @@ func (p *provisionerDaemon) closeWithError(err error) error {
 	if p.isClosed() {
 		return p.closeError
 	}
-	p.closeCancel()
+	p.closeError = err
+	close(p.closed)
+
 	errMsg := "provisioner daemon was shutdown gracefully"
 	if err != nil {
 		errMsg = err.Error()
 	}
-	p.cancelActiveJob(errMsg)
-	p.jobMutex.Lock()
-	defer p.jobMutex.Unlock()
-	p.opts.Logger.Debug(context.Background(), "closing server with error", slog.Error(err))
-	p.closeError = err
-	close(p.closed)
+	p.cancelActiveJobf(errMsg)
+	p.closeCancel()
 
-	if p.updateStream != nil {
-		_ = p.client.DRPCConn().Close()
-		_ = p.updateStream.Close()
-	}
+	// Required until we're on Go 1.18. See:
+	// https://github.com/golang/go/issues/50510
+	_ = os.RemoveAll(p.opts.WorkDirectory)
+	p.opts.Logger.Debug(context.Background(), "closing server with error", slog.Error(err))
 
 	return err
 }

0 commit comments
