From 91fa90141d807c50a113451639ecc7d3633d2a48 Mon Sep 17 00:00:00 2001 From: Bryan Phelps Date: Wed, 2 Feb 2022 22:33:37 +0000 Subject: [PATCH 1/7] Log job id for history failure --- provisionerd/provisionerd.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/provisionerd/provisionerd.go b/provisionerd/provisionerd.go index 051515304776f..034a13940df1a 100644 --- a/provisionerd/provisionerd.go +++ b/provisionerd/provisionerd.go @@ -187,6 +187,7 @@ func (p *provisionerDaemon) acquireJob(ctx context.Context) { p.acquiredJobDone = make(chan struct{}) p.opts.Logger.Info(context.Background(), "acquired job", + slog.F("job_id", p.acquiredJob.JobId) slog.F("organization_name", p.acquiredJob.OrganizationName), slog.F("project_name", p.acquiredJob.ProjectName), slog.F("username", p.acquiredJob.UserName), @@ -328,7 +329,7 @@ func (p *provisionerDaemon) runJob(ctx context.Context) { } p.acquiredJobCancel() - p.opts.Logger.Info(context.Background(), "completed job") + p.opts.Logger.Info(context.Background(), "completed job", slog.F("job_id", p.acquiredJob.JobId)) } func (p *provisionerDaemon) runProjectImport(ctx context.Context, provisioner sdkproto.DRPCProvisionerClient, job *proto.AcquiredJob_ProjectImport_) { From 576e8fb4136f4eb8d883d74efa61a76ce3004c4d Mon Sep 17 00:00:00 2001 From: Bryan Phelps Date: Wed, 2 Feb 2022 22:46:15 +0000 Subject: [PATCH 2/7] Add additional logging --- provisioner/terraform/provision.go | 4 ++++ provisionerd/provisionerd.go | 3 ++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/provisioner/terraform/provision.go b/provisioner/terraform/provision.go index e528abaaf44ea..d4cb3d2eb46f4 100644 --- a/provisioner/terraform/provision.go +++ b/provisioner/terraform/provision.go @@ -12,6 +12,8 @@ import ( "github.com/hashicorp/terraform-exec/tfexec" "golang.org/x/xerrors" + "cdr.dev/slog" + "github.com/coder/coder/provisionersdk/proto" ) @@ -149,6 +151,7 @@ func (t *terraform) Provision(request *proto.Provision_Request, stream proto.DRP resources := make([]*proto.Resource, 0) if state.Values != nil { for _, resource := range state.Values.RootModule.Resources { + t.logger.Debug(ctx, "appending state file", slog.F("name", resource.Name), slog.F("type", resource.Type)) resources = append(resources, &proto.Resource{ Name: resource.Name, Type: resource.Type, @@ -156,6 +159,7 @@ func (t *terraform) Provision(request *proto.Provision_Request, stream proto.DRP } } + t.logger.Debug(ctx, "sending completion response") return stream.Send(&proto.Provision_Response{ Type: &proto.Provision_Response_Complete{ Complete: &proto.Provision_Complete{ diff --git a/provisionerd/provisionerd.go b/provisionerd/provisionerd.go index 034a13940df1a..c0baa63e3c369 100644 --- a/provisionerd/provisionerd.go +++ b/provisionerd/provisionerd.go @@ -187,11 +187,12 @@ func (p *provisionerDaemon) acquireJob(ctx context.Context) { p.acquiredJobDone = make(chan struct{}) p.opts.Logger.Info(context.Background(), "acquired job", - slog.F("job_id", p.acquiredJob.JobId) + slog.F("job_id", p.acquiredJob.JobId), slog.F("organization_name", p.acquiredJob.OrganizationName), slog.F("project_name", p.acquiredJob.ProjectName), slog.F("username", p.acquiredJob.UserName), slog.F("provisioner", p.acquiredJob.Provisioner), + slog.F("job_type", p.acquiredJob.Type), ) go p.runJob(ctx) From 6c28118d02f6ee31c9205cad968fa273291afd8c Mon Sep 17 00:00:00 2001 From: Bryan Phelps Date: Wed, 2 Feb 2022 22:46:42 +0000 Subject: [PATCH 3/7] bump From d9ed06d596b8b400b98d3ebe8945a8dcda550d0a Mon Sep 17 00:00:00 2001 From: Bryan Phelps Date: Wed, 2 Feb 2022 23:16:17 +0000 Subject: [PATCH 4/7] Fix data race in provisioner p.acquiredJobDone chan --- provisionerd/provisionerd.go | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/provisionerd/provisionerd.go b/provisionerd/provisionerd.go index c0baa63e3c369..bf47680af11fb 100644 --- a/provisionerd/provisionerd.go +++ b/provisionerd/provisionerd.go @@ -156,6 +156,9 @@ func (p *provisionerDaemon) connect(ctx context.Context) { // Locks a job in the database, and runs it! func (p *provisionerDaemon) acquireJob(ctx context.Context) { + if p.isClosed() { + return + } p.acquiredJobMutex.Lock() defer p.acquiredJobMutex.Unlock() if p.isRunningJob() { @@ -174,9 +177,6 @@ func (p *provisionerDaemon) acquireJob(ctx context.Context) { p.opts.Logger.Warn(context.Background(), "acquire job", slog.Error(err)) return } - if p.isClosed() { - return - } if p.acquiredJob.JobId == "" { p.opts.Logger.Debug(context.Background(), "no jobs available") return @@ -184,6 +184,7 @@ func (p *provisionerDaemon) acquireJob(ctx context.Context) { ctx, p.acquiredJobCancel = context.WithCancel(ctx) p.acquiredJobCancelled.Store(false) p.acquiredJobRunning.Store(true) + p.acquiredJobDone = make(chan struct{}) p.opts.Logger.Info(context.Background(), "acquired job", @@ -234,8 +235,6 @@ func (p *provisionerDaemon) runJob(ctx context.Context) { return } p.opts.Logger.Debug(ctx, "cleaned up work directory") - p.acquiredJobMutex.Lock() - defer p.acquiredJobMutex.Unlock() p.acquiredJobRunning.Store(false) close(p.acquiredJobDone) }() @@ -510,11 +509,22 @@ func (p *provisionerDaemon) Close() error { func (p *provisionerDaemon) closeWithError(err error) error { p.closeMutex.Lock() defer p.closeMutex.Unlock() + if p.isClosed() { return p.closeError } if p.isRunningJob() { + + // We also need the 'acquire job' mutex here, + // so that a new `p.acquiredJobDone` channel isn't created + // while we're waiting on the mutex. + + // Note the mutex order - it's important that we always use the same order of acquisition + // to avoid deadlocks + p.acquiredJobMutex.Lock() + defer p.acquiredJobMutex.Unlock() + errMsg := "provisioner daemon was shutdown gracefully" if err != nil { errMsg = err.Error() From 42ce721be04dd5bda0c69227af1a02896b9468fd Mon Sep 17 00:00:00 2001 From: Bryan Phelps Date: Wed, 2 Feb 2022 23:20:07 +0000 Subject: [PATCH 5/7] Try out race fix for acquiredJobMutex --- provisionerd/provisionerd.go | 1 - 1 file changed, 1 deletion(-) diff --git a/provisionerd/provisionerd.go b/provisionerd/provisionerd.go index bf47680af11fb..d98eca2663628 100644 --- a/provisionerd/provisionerd.go +++ b/provisionerd/provisionerd.go @@ -515,7 +515,6 @@ func (p *provisionerDaemon) closeWithError(err error) error { } if p.isRunningJob() { - // We also need the 'acquire job' mutex here, // so that a new `p.acquiredJobDone` channel isn't created // while we're waiting on the mutex. From 84dd68aa39413ebd6ee2c7206dead04aac89e73a Mon Sep 17 00:00:00 2001 From: Bryan Phelps Date: Wed, 2 Feb 2022 23:27:26 +0000 Subject: [PATCH 6/7] Try #2 - don't create chan every acquirejob --- provisionerd/provisionerd.go | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/provisionerd/provisionerd.go b/provisionerd/provisionerd.go index d98eca2663628..e0d2b2fcfecf6 100644 --- a/provisionerd/provisionerd.go +++ b/provisionerd/provisionerd.go @@ -50,12 +50,12 @@ func New(clientDialer Dialer, opts *Options) io.Closer { } ctx, ctxCancel := context.WithCancel(context.Background()) daemon := &provisionerDaemon{ - clientDialer: clientDialer, - opts: opts, - - closeContext: ctx, - closeCancel: ctxCancel, - closed: make(chan struct{}), + clientDialer: clientDialer, + opts: opts, + acquiredJobDone: make(chan struct{}), + closeContext: ctx, + closeCancel: ctxCancel, + closed: make(chan struct{}), } go daemon.connect(ctx) return daemon @@ -185,8 +185,6 @@ func (p *provisionerDaemon) acquireJob(ctx context.Context) { p.acquiredJobCancelled.Store(false) p.acquiredJobRunning.Store(true) - p.acquiredJobDone = make(chan struct{}) - p.opts.Logger.Info(context.Background(), "acquired job", slog.F("job_id", p.acquiredJob.JobId), slog.F("organization_name", p.acquiredJob.OrganizationName), @@ -515,15 +513,6 @@ func (p *provisionerDaemon) closeWithError(err error) error { } if p.isRunningJob() { - // We also need the 'acquire job' mutex here, - // so that a new `p.acquiredJobDone` channel isn't created - // while we're waiting on the mutex. - - // Note the mutex order - it's important that we always use the same order of acquisition - // to avoid deadlocks - p.acquiredJobMutex.Lock() - defer p.acquiredJobMutex.Unlock() - errMsg := "provisioner daemon was shutdown gracefully" if err != nil { errMsg = err.Error() From a8725cd495e69a65152876ea5991118f6eed83f4 Mon Sep 17 00:00:00 2001 From: Bryan Phelps Date: Wed, 2 Feb 2022 23:36:13 +0000 Subject: [PATCH 7/7] Can't close a channel multiple times... switch to waitgroup --- provisionerd/provisionerd.go | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/provisionerd/provisionerd.go b/provisionerd/provisionerd.go index e0d2b2fcfecf6..4153fcfb4be5e 100644 --- a/provisionerd/provisionerd.go +++ b/provisionerd/provisionerd.go @@ -50,12 +50,11 @@ func New(clientDialer Dialer, opts *Options) io.Closer { } ctx, ctxCancel := context.WithCancel(context.Background()) daemon := &provisionerDaemon{ - clientDialer: clientDialer, - opts: opts, - acquiredJobDone: make(chan struct{}), - closeContext: ctx, - closeCancel: ctxCancel, - closed: make(chan struct{}), + clientDialer: clientDialer, + opts: opts, + closeContext: ctx, + closeCancel: ctxCancel, + closed: make(chan struct{}), } go daemon.connect(ctx) return daemon @@ -90,7 +89,7 @@ type provisionerDaemon struct { acquiredJobCancel context.CancelFunc acquiredJobCancelled atomic.Bool acquiredJobRunning atomic.Bool - acquiredJobDone chan struct{} + acquiredJobGroup sync.WaitGroup } // Connect establishes a connection to coderd. @@ -182,6 +181,7 @@ func (p *provisionerDaemon) acquireJob(ctx context.Context) { return } ctx, p.acquiredJobCancel = context.WithCancel(ctx) + p.acquiredJobGroup.Add(1) p.acquiredJobCancelled.Store(false) p.acquiredJobRunning.Store(true) @@ -234,7 +234,7 @@ func (p *provisionerDaemon) runJob(ctx context.Context) { } p.opts.Logger.Debug(ctx, "cleaned up work directory") p.acquiredJobRunning.Store(false) - close(p.acquiredJobDone) + p.acquiredJobGroup.Done() }() // It's safe to cast this ProvisionerType. This data is coming directly from coderd. provisioner, hasProvisioner := p.opts.Provisioners[p.acquiredJob.Provisioner] @@ -520,7 +520,7 @@ func (p *provisionerDaemon) closeWithError(err error) error { if !p.acquiredJobCancelled.Load() { p.cancelActiveJob(errMsg) } - <-p.acquiredJobDone + p.acquiredJobGroup.Wait() } p.opts.Logger.Debug(context.Background(), "closing server with error", slog.Error(err))