diff --git a/cli/agent.go b/cli/agent.go index cbe62c77a6b4e..baf55eccea222 100644 --- a/cli/agent.go +++ b/cli/agent.go @@ -39,6 +39,8 @@ func workspaceAgent() *cobra.Command { ctx, cancel := context.WithCancel(cmd.Context()) defer cancel() + go dumpHandler(ctx) + rawURL, err := cmd.Flags().GetString(varAgentURL) if err != nil { return xerrors.Errorf("CODER_AGENT_URL must be set: %w", err) diff --git a/cli/root.go b/cli/root.go index 190bb35ac8a63..e7fdab6b91264 100644 --- a/cli/root.go +++ b/cli/root.go @@ -8,8 +8,11 @@ import ( "net/http" "net/url" "os" + "os/signal" + "path/filepath" "runtime" "strings" + "syscall" "text/template" "time" @@ -631,3 +634,93 @@ func (h *headerTransport) RoundTrip(req *http.Request) (*http.Response, error) { } return h.transport.RoundTrip(req) } + +// dumpHandler provides a custom SIGQUIT and SIGTRAP handler that dumps the +// stacktrace of all goroutines to stderr and a well-known file in the home +// directory. This is useful for debugging deadlock issues that may occur in +// production in workspaces, since the default Go runtime will only dump to +// stderr (which is often difficult/impossible to read in a workspace). +// +// SIGQUITs will still cause the program to exit (similarly to the default Go +// runtime behavior). +// +// A SIGQUIT handler will not be registered if GOTRACEBACK=crash. +// +// On Windows this immediately returns. +func dumpHandler(ctx context.Context) { + if runtime.GOOS == "windows" { + // free up the goroutine since it'll be permanently blocked anyways + return + } + + listenSignals := []os.Signal{syscall.SIGTRAP} + if os.Getenv("GOTRACEBACK") != "crash" { + listenSignals = append(listenSignals, syscall.SIGQUIT) + } + + sigs := make(chan os.Signal, 1) + signal.Notify(sigs, listenSignals...) + defer signal.Stop(sigs) + + for { + sigStr := "" + select { + case <-ctx.Done(): + return + case sig := <-sigs: + switch sig { + case syscall.SIGQUIT: + sigStr = "SIGQUIT" + case syscall.SIGTRAP: + sigStr = "SIGTRAP" + } + } + + // Start with a 1MB buffer and keep doubling it until we can fit the + // entire stacktrace, stopping early once we reach 64MB. + buf := make([]byte, 1_000_000) + stacklen := 0 + for { + stacklen = runtime.Stack(buf, true) + if stacklen < len(buf) { + break + } + if 2*len(buf) > 64_000_000 { + // Write a message to the end of the buffer saying that it was + // truncated. + const truncatedMsg = "\n\n\nstack trace truncated due to size\n" + copy(buf[len(buf)-len(truncatedMsg):], truncatedMsg) + break + } + buf = make([]byte, 2*len(buf)) + } + + _, _ = fmt.Fprintf(os.Stderr, "%s:\n%s\n", sigStr, buf[:stacklen]) + + // Write to a well-known file. + dir, err := os.UserHomeDir() + if err != nil { + dir = os.TempDir() + } + fpath := filepath.Join(dir, fmt.Sprintf("coder-agent-%s.dump", time.Now().Format("2006-01-02T15:04:05.000Z"))) + _, _ = fmt.Fprintf(os.Stderr, "writing dump to %q\n", fpath) + + f, err := os.Create(fpath) + if err != nil { + _, _ = fmt.Fprintf(os.Stderr, "failed to open dump file: %v\n", err.Error()) + goto done + } + _, err = f.Write(buf[:stacklen]) + _ = f.Close() + if err != nil { + _, _ = fmt.Fprintf(os.Stderr, "failed to write dump file: %v\n", err.Error()) + goto done + } + + done: + if sigStr == "SIGQUIT" { + //nolint:revive + os.Exit(1) + } + } +} diff --git a/cli/server.go b/cli/server.go index 2b4e32c17484b..a543f55a0e8c6 100644 --- a/cli/server.go +++ b/cli/server.go @@ -81,6 +81,13 @@ func Server(vip *viper.Viper, newAPI func(context.Context, *coderd.Options) (*co Use: "server", Short: "Start a Coder server", RunE: func(cmd *cobra.Command, args []string) error { + // Main command context for managing cancellation of running + // services. + ctx, cancel := context.WithCancel(cmd.Context()) + defer cancel() + + go dumpHandler(ctx) + cfg, err := deployment.Config(cmd.Flags(), vip) if err != nil { return xerrors.Errorf("getting deployment config: %w", err) @@ -123,11 +130,6 @@ func Server(vip *viper.Viper, newAPI func(context.Context, *coderd.Options) (*co logger = logger.AppendSinks(tracing.SlogSink{}) } - // Main command context for managing cancellation - // of running services. - ctx, cancel := context.WithCancel(cmd.Context()) - defer cancel() - // Register signals early on so that graceful shutdown can't // be interrupted by additional signals. Note that we avoid // shadowing cancel() (from above) here because notifyStop()