Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit e4964ae

Browse files
stirbyspikecurtis
andauthored
chore: log provider stack traces on text file busy (#15249)
re: #14726 If we see "text file busy" in the errors while initializing terraform, attempt to query the pprof endpoint set up by coder/terraform-provider-coder#295 and log at CRITICAL. --------- Signed-off-by: Spike Curtis <[email protected]> (cherry picked from commit d676ad5) Co-authored-by: Spike Curtis <[email protected]>
1 parent f214d03 commit e4964ae

File tree

4 files changed

+235
-5
lines changed

4 files changed

+235
-5
lines changed

provisioner/terraform/executor.go

+44-1
Original file line numberDiff line numberDiff line change
@@ -200,6 +200,15 @@ func versionFromBinaryPath(ctx context.Context, binaryPath string) (*version.Ver
200200
return version.NewVersion(vj.Version)
201201
}
202202

203+
type textFileBusyError struct {
204+
exitErr *exec.ExitError
205+
stderr string
206+
}
207+
208+
func (e *textFileBusyError) Error() string {
209+
return "text file busy: " + e.exitErr.String()
210+
}
211+
203212
func (e *executor) init(ctx, killCtx context.Context, logr logSink) error {
204213
ctx, span := e.server.startTrace(ctx, tracing.FuncName())
205214
defer span.End()
@@ -216,13 +225,24 @@ func (e *executor) init(ctx, killCtx context.Context, logr logSink) error {
216225
<-doneErr
217226
}()
218227

228+
// As a special case, we want to look for the error "text file busy" in the stderr output of
229+
// the init command, so we also take a copy of the stderr into an in memory buffer.
230+
errBuf := newBufferedWriteCloser(errWriter)
231+
219232
args := []string{
220233
"init",
221234
"-no-color",
222235
"-input=false",
223236
}
224237

225-
return e.execWriteOutput(ctx, killCtx, args, e.basicEnv(), outWriter, errWriter)
238+
err := e.execWriteOutput(ctx, killCtx, args, e.basicEnv(), outWriter, errBuf)
239+
var exitErr *exec.ExitError
240+
if xerrors.As(err, &exitErr) {
241+
if bytes.Contains(errBuf.b.Bytes(), []byte("text file busy")) {
242+
return &textFileBusyError{exitErr: exitErr, stderr: errBuf.b.String()}
243+
}
244+
}
245+
return err
226246
}
227247

228248
func getPlanFilePath(workdir string) string {
@@ -707,3 +727,26 @@ func (sw syncWriter) Write(p []byte) (n int, err error) {
707727
defer sw.mut.Unlock()
708728
return sw.w.Write(p)
709729
}
730+
731+
type bufferedWriteCloser struct {
732+
wc io.WriteCloser
733+
b bytes.Buffer
734+
}
735+
736+
func newBufferedWriteCloser(wc io.WriteCloser) *bufferedWriteCloser {
737+
return &bufferedWriteCloser{
738+
wc: wc,
739+
}
740+
}
741+
742+
func (b *bufferedWriteCloser) Write(p []byte) (int, error) {
743+
n, err := b.b.Write(p)
744+
if err != nil {
745+
return n, err
746+
}
747+
return b.wc.Write(p)
748+
}
749+
750+
func (b *bufferedWriteCloser) Close() error {
751+
return b.wc.Close()
752+
}

provisioner/terraform/provision.go

+62
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,11 @@ import (
44
"context"
55
"encoding/json"
66
"fmt"
7+
"io"
8+
"net"
9+
"net/http"
710
"os"
11+
"path/filepath"
812
"strings"
913
"time"
1014

@@ -109,10 +113,32 @@ func (s *server) Plan(
109113
initTimings.ingest(createInitTimingsEvent(timingInitStart))
110114

111115
err = e.init(ctx, killCtx, sess)
116+
112117
if err != nil {
113118
initTimings.ingest(createInitTimingsEvent(timingInitErrored))
114119

115120
s.logger.Debug(ctx, "init failed", slog.Error(err))
121+
122+
// Special handling for "text file busy" c.f. https://github.com/coder/coder/issues/14726
123+
// We believe this might be due to some race condition that prevents the
124+
// terraform-provider-coder process from exiting. When terraform tries to install the
125+
// provider during this init, it copies over the local cache. Normally this isn't an issue,
126+
// but if the terraform-provider-coder process is still running from a previous build, Linux
127+
// returns "text file busy" error when attempting to open the file.
128+
//
129+
// Capturing the stack trace from the process should help us figure out why it has not
130+
// exited. We'll drop these diagnostics in a CRITICAL log so that operators are likely to
131+
// notice, and also because it indicates this provisioner could be permanently broken and
132+
// require a restart.
133+
var errTFB *textFileBusyError
134+
if xerrors.As(err, &errTFB) {
135+
stacktrace := tryGettingCoderProviderStacktrace(sess)
136+
s.logger.Critical(ctx, "init: text file busy",
137+
slog.Error(errTFB),
138+
slog.F("stderr", errTFB.stderr),
139+
slog.F("provider_coder_stacktrace", stacktrace),
140+
)
141+
}
116142
return provisionersdk.PlanErrorf("initialize terraform: %s", err)
117143
}
118144

@@ -280,3 +306,39 @@ func logTerraformEnvVars(sink logSink) {
280306
}
281307
}
282308
}
309+
310+
// tryGettingCoderProviderStacktrace attempts to dial a special pprof endpoint we added to
311+
// terraform-provider-coder in https://github.com/coder/terraform-provider-coder/pull/295 which
312+
// shipped in v1.0.4. It will return the stacktraces of the provider, which will hopefully allow us
313+
// to figure out why it hasn't exited.
314+
func tryGettingCoderProviderStacktrace(sess *provisionersdk.Session) string {
315+
path := filepath.Clean(filepath.Join(sess.WorkDirectory, "../.coder/pprof"))
316+
sess.Logger.Info(sess.Context(), "attempting to get stack traces", slog.F("path", path))
317+
c := http.Client{
318+
Transport: &http.Transport{
319+
DialContext: func(ctx context.Context, _, _ string) (net.Conn, error) {
320+
d := net.Dialer{}
321+
return d.DialContext(ctx, "unix", path)
322+
},
323+
},
324+
}
325+
req, err := http.NewRequestWithContext(sess.Context(), http.MethodGet,
326+
"http://localhost/debug/pprof/goroutine?debug=2", nil)
327+
if err != nil {
328+
sess.Logger.Error(sess.Context(), "error creating GET request", slog.Error(err))
329+
return ""
330+
}
331+
resp, err := c.Do(req)
332+
if err != nil {
333+
// Only log at Info here, since we only added the pprof endpoint to terraform-provider-coder
334+
// in v1.0.4
335+
sess.Logger.Info(sess.Context(), "could not GET stack traces", slog.Error(err))
336+
return ""
337+
}
338+
defer resp.Body.Close()
339+
stacktraces, err := io.ReadAll(resp.Body)
340+
if err != nil {
341+
sess.Logger.Error(sess.Context(), "could not read stack traces", slog.Error(err))
342+
}
343+
return string(stacktraces)
344+
}

provisioner/terraform/provision_test.go

+89-4
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@ import (
99
"encoding/json"
1010
"errors"
1111
"fmt"
12+
"net"
13+
"net/http"
1214
"os"
1315
"path/filepath"
1416
"runtime"
@@ -31,14 +33,22 @@ import (
3133
type provisionerServeOptions struct {
3234
binaryPath string
3335
exitTimeout time.Duration
36+
workDir string
37+
logger *slog.Logger
3438
}
3539

3640
func setupProvisioner(t *testing.T, opts *provisionerServeOptions) (context.Context, proto.DRPCProvisionerClient) {
3741
if opts == nil {
3842
opts = &provisionerServeOptions{}
3943
}
4044
cachePath := t.TempDir()
41-
workDir := t.TempDir()
45+
if opts.workDir == "" {
46+
opts.workDir = t.TempDir()
47+
}
48+
if opts.logger == nil {
49+
logger := slogtest.Make(t, nil).Leveled(slog.LevelDebug)
50+
opts.logger = &logger
51+
}
4252
client, server := drpc.MemTransportPipe()
4353
ctx, cancelFunc := context.WithCancel(context.Background())
4454
serverErr := make(chan error, 1)
@@ -55,8 +65,8 @@ func setupProvisioner(t *testing.T, opts *provisionerServeOptions) (context.Cont
5565
serverErr <- terraform.Serve(ctx, &terraform.ServeOptions{
5666
ServeOptions: &provisionersdk.ServeOptions{
5767
Listener: server,
58-
Logger: slogtest.Make(t, nil).Leveled(slog.LevelDebug),
59-
WorkDirectory: workDir,
68+
Logger: *opts.logger,
69+
WorkDirectory: opts.workDir,
6070
},
6171
BinaryPath: opts.binaryPath,
6272
CachePath: cachePath,
@@ -236,7 +246,7 @@ func TestProvision_CancelTimeout(t *testing.T) {
236246
dir := t.TempDir()
237247
binPath := filepath.Join(dir, "terraform")
238248

239-
// Example: exec /path/to/terrafork_fake_cancel.sh 1.2.1 apply "$@"
249+
// Example: exec /path/to/terraform_fake_cancel.sh 1.2.1 apply "$@"
240250
content := fmt.Sprintf("#!/bin/sh\nexec %q %s \"$@\"\n", fakeBin, terraform.TerraformVersion.String())
241251
err = os.WriteFile(binPath, []byte(content), 0o755) //#nosec
242252
require.NoError(t, err)
@@ -282,6 +292,81 @@ func TestProvision_CancelTimeout(t *testing.T) {
282292
}
283293
}
284294

295+
// below we exec fake_text_file_busy.sh, which causes the kernel to execute it, and if more than
296+
// one process tries to do this, it can cause "text file busy" to be returned to us. In this test
297+
// we want to simulate "text file busy" getting logged by terraform, due to an issue with the
298+
// terraform-provider-coder
299+
// nolint: paralleltest
300+
func TestProvision_TextFileBusy(t *testing.T) {
301+
if runtime.GOOS == "windows" {
302+
t.Skip("This test uses unix sockets and is not supported on Windows")
303+
}
304+
305+
cwd, err := os.Getwd()
306+
require.NoError(t, err)
307+
fakeBin := filepath.Join(cwd, "testdata", "fake_text_file_busy.sh")
308+
309+
dir := t.TempDir()
310+
binPath := filepath.Join(dir, "terraform")
311+
312+
// Example: exec /path/to/terraform_fake_cancel.sh 1.2.1 apply "$@"
313+
content := fmt.Sprintf("#!/bin/sh\nexec %q %s \"$@\"\n", fakeBin, terraform.TerraformVersion.String())
314+
err = os.WriteFile(binPath, []byte(content), 0o755) //#nosec
315+
require.NoError(t, err)
316+
317+
workDir := t.TempDir()
318+
319+
err = os.Mkdir(filepath.Join(workDir, ".coder"), 0o700)
320+
require.NoError(t, err)
321+
l, err := net.Listen("unix", filepath.Join(workDir, ".coder", "pprof"))
322+
require.NoError(t, err)
323+
defer l.Close()
324+
handlerCalled := 0
325+
// nolint: gosec
326+
srv := &http.Server{
327+
Handler: http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
328+
assert.Equal(t, "/debug/pprof/goroutine", r.URL.Path)
329+
w.WriteHeader(http.StatusOK)
330+
_, err := w.Write([]byte("thestacks\n"))
331+
assert.NoError(t, err)
332+
handlerCalled++
333+
}),
334+
}
335+
srvErr := make(chan error, 1)
336+
go func() {
337+
srvErr <- srv.Serve(l)
338+
}()
339+
340+
logger := slogtest.Make(t, &slogtest.Options{IgnoreErrors: true})
341+
ctx, api := setupProvisioner(t, &provisionerServeOptions{
342+
binaryPath: binPath,
343+
exitTimeout: time.Second,
344+
workDir: workDir,
345+
logger: &logger,
346+
})
347+
348+
sess := configure(ctx, t, api, &proto.Config{
349+
TemplateSourceArchive: makeTar(t, nil),
350+
})
351+
352+
err = sendPlan(sess, proto.WorkspaceTransition_START)
353+
require.NoError(t, err)
354+
355+
found := false
356+
for {
357+
msg, err := sess.Recv()
358+
require.NoError(t, err)
359+
360+
if c := msg.GetPlan(); c != nil {
361+
require.Contains(t, c.Error, "exit status 1")
362+
found = true
363+
break
364+
}
365+
}
366+
require.True(t, found)
367+
require.EqualValues(t, 1, handlerCalled)
368+
}
369+
285370
func TestProvision(t *testing.T) {
286371
t.Parallel()
287372

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
#!/bin/sh
2+
3+
VERSION=$1
4+
shift 1
5+
6+
json_print() {
7+
echo "{\"@level\":\"error\",\"@message\":\"$*\"}"
8+
}
9+
10+
case "$1" in
11+
version)
12+
cat <<-EOF
13+
{
14+
"terraform_version": "${VERSION}",
15+
"platform": "linux_amd64",
16+
"provider_selections": {},
17+
"terraform_outdated": false
18+
}
19+
EOF
20+
exit 0
21+
;;
22+
init)
23+
echo "init"
24+
echo >&2 "Error: Failed to install provider"
25+
echo >&2 " Error while installing coder/coder v1.0.4: open"
26+
echo >&2 " /home/coder/.cache/coder/provisioner-0/tf/registry.terraform.io/coder/coder/1.0.3/linux_amd64/terraform-provider-coder_v1.0.4:"
27+
echo >&2 " text file busy"
28+
exit 1
29+
;;
30+
plan)
31+
echo "plan not supported"
32+
exit 1
33+
;;
34+
apply)
35+
echo "apply not supported"
36+
exit 1
37+
;;
38+
esac
39+
40+
exit 10

0 commit comments

Comments
 (0)