diff --git a/agent/reaper/doc.go b/agent/reaper/doc.go new file mode 100644 index 0000000000000..b4a8a42910a4c --- /dev/null +++ b/agent/reaper/doc.go @@ -0,0 +1,4 @@ +// Package reaper contains logic for reaping subprocesses. It is +// specifically used in the agent to avoid the accumulation of +// zombie processes. +package reaper diff --git a/agent/reaper/reaper_stub.go b/agent/reaper/reaper_stub.go new file mode 100644 index 0000000000000..9021165c975d8 --- /dev/null +++ b/agent/reaper/reaper_stub.go @@ -0,0 +1,19 @@ +//go:build !linux + +package reaper + +import "github.com/hashicorp/go-reap" + +// IsChild returns true if we're the forked process. +func IsChild() bool { + return false +} + +// IsInitProcess returns true if the current process's PID is 1. +func IsInitProcess() bool { + return false +} + +func ForkReap(pids reap.PidCh) error { + return nil +} diff --git a/agent/reaper/reaper_test.go b/agent/reaper/reaper_test.go new file mode 100644 index 0000000000000..8e9ca9143d3e3 --- /dev/null +++ b/agent/reaper/reaper_test.go @@ -0,0 +1,66 @@ +//go:build linux + +package reaper_test + +import ( + "os" + "os/exec" + "testing" + "time" + + "github.com/hashicorp/go-reap" + "github.com/stretchr/testify/require" + + "github.com/coder/coder/agent/reaper" +) + +func TestReap(t *testing.T) { + t.Parallel() + + // Don't run the reaper test in CI. It does weird + // things like forkexecing which may have unintended + // consequences in CI. + if _, ok := os.LookupEnv("CI"); ok { + t.Skip("Detected CI, skipping reaper tests") + } + + // Because we're forkexecing these tests will try to run twice... + if reaper.IsChild() { + t.Skip("I'm a child!") + } + + // OK checks that's the reaper is successfully reaping + // exited processes and passing the PIDs through the shared + // channel. + t.Run("OK", func(t *testing.T) { + pids := make(reap.PidCh, 1) + err := reaper.ForkReap(pids) + require.NoError(t, err) + + cmd := exec.Command("tail", "-f", "/dev/null") + err = cmd.Start() + require.NoError(t, err) + + cmd2 := exec.Command("tail", "-f", "/dev/null") + err = cmd2.Start() + require.NoError(t, err) + + err = cmd.Process.Kill() + require.NoError(t, err) + + err = cmd2.Process.Kill() + require.NoError(t, err) + + expectedPIDs := []int{cmd.Process.Pid, cmd2.Process.Pid} + + deadline := time.NewTimer(time.Second * 5) + for i := 0; i < len(expectedPIDs); i++ { + select { + case <-deadline.C: + t.Fatalf("Timed out waiting for process") + case pid := <-pids: + require.Contains(t, expectedPIDs, pid) + } + } + }) +} diff --git a/agent/reaper/reaper_unix.go b/agent/reaper/reaper_unix.go new file mode 100644 index 0000000000000..8a2a8656857a4 --- /dev/null +++ b/agent/reaper/reaper_unix.go @@ -0,0 +1,79 @@ +//go:build linux + +package reaper + +import ( + "fmt" + "os" + "syscall" + + "github.com/hashicorp/go-reap" + "golang.org/x/xerrors" +) + +// agentEnvMark is a simple environment variable that we use as a marker +// to indicated that the process is a child as opposed to the reaper. +// Since we are forkexec'ing we need to be able to differentiate between +// the two to avoid fork bombing ourselves. +const agentEnvMark = "CODER_DO_NOT_REAP" + +// IsChild returns true if we're the forked process. +func IsChild() bool { + return os.Getenv(agentEnvMark) != "" +} + +// IsInitProcess returns true if the current process's PID is 1. +func IsInitProcess() bool { + return os.Getpid() == 1 +} + +// ForkReap spawns a goroutine that reaps children. In order to avoid +// complications with spawning `exec.Commands` in the same process that +// is reaping, we forkexec a child process. This prevents a race between +// the reaper and an exec.Command waiting for its process to complete. +// The provided 'pids' channel may be nil if the caller does not care about the +// reaped children PIDs. +func ForkReap(pids reap.PidCh) error { + // Check if the process is the parent or the child. + // If it's the child we want to skip attempting to reap. + if IsChild() { + return nil + } + + go reap.ReapChildren(pids, nil, nil, nil) + + args := os.Args + // This is simply done to help identify the real agent process + // when viewing in something like 'ps'. + args = append(args, "#Agent") + + pwd, err := os.Getwd() + if err != nil { + return xerrors.Errorf("get wd: %w", err) + } + + pattrs := &syscall.ProcAttr{ + Dir: pwd, + // Add our marker for identifying the child process. + Env: append(os.Environ(), fmt.Sprintf("%s=true", agentEnvMark)), + Sys: &syscall.SysProcAttr{ + Setsid: true, + }, + Files: []uintptr{ + uintptr(syscall.Stdin), + uintptr(syscall.Stdout), + uintptr(syscall.Stderr), + }, + } + + //#nosec G204 + pid, _ := syscall.ForkExec(args[0], args, pattrs) + + var wstatus syscall.WaitStatus + _, err = syscall.Wait4(pid, &wstatus, 0, nil) + for xerrors.Is(err, syscall.EINTR) { + _, err = syscall.Wait4(pid, &wstatus, 0, nil) + } + + return nil +} diff --git a/cli/agent.go b/cli/agent.go index 287f18f7fbefa..4e0e87209846e 100644 --- a/cli/agent.go +++ b/cli/agent.go @@ -7,6 +7,7 @@ import ( "net/url" "os" "path/filepath" + "runtime" "time" "cloud.google.com/go/compute/metadata" @@ -17,6 +18,7 @@ import ( "cdr.dev/slog/sloggers/sloghuman" "github.com/coder/coder/agent" + "github.com/coder/coder/agent/reaper" "github.com/coder/coder/cli/cliflag" "github.com/coder/coder/codersdk" "github.com/coder/retry" @@ -50,6 +52,23 @@ func workspaceAgent() *cobra.Command { } defer logWriter.Close() logger := slog.Make(sloghuman.Sink(cmd.ErrOrStderr()), sloghuman.Sink(logWriter)).Leveled(slog.LevelDebug) + + isLinux := runtime.GOOS == "linux" + + // Spawn a reaper so that we don't accumulate a ton + // of zombie processes. + if reaper.IsInitProcess() && !reaper.IsChild() && isLinux { + logger.Info(cmd.Context(), "spawning reaper process") + err := reaper.ForkReap(nil) + if err != nil { + logger.Error(cmd.Context(), "failed to reap", slog.Error(err)) + return xerrors.Errorf("fork reap: %w", err) + } + + logger.Info(cmd.Context(), "reaper process exiting") + return nil + } + client := codersdk.New(coderURL) if pprofEnabled { diff --git a/go.mod b/go.mod index a7029e2df5d73..760acc82d0a39 100644 --- a/go.mod +++ b/go.mod @@ -73,6 +73,7 @@ require ( github.com/golang-migrate/migrate/v4 v4.15.2 github.com/google/go-github/v43 v43.0.1-0.20220414155304-00e42332e405 github.com/google/uuid v1.3.0 + github.com/hashicorp/go-reap v0.0.0-20170704170343-bf58d8a43e7b github.com/hashicorp/go-version v1.5.0 github.com/hashicorp/hc-install v0.3.2 github.com/hashicorp/hcl/v2 v2.12.0 @@ -134,8 +135,6 @@ require ( storj.io/drpc v0.0.30 ) -require github.com/xi2/xz v0.0.0-20171230120015-48954b6210f8 // indirect - require ( github.com/Azure/go-ansiterm v0.0.0-20210617225240-d185dfc1b5a1 // indirect github.com/Microsoft/go-winio v0.5.2 // indirect @@ -236,6 +235,7 @@ require ( github.com/xeipuuv/gojsonpointer v0.0.0-20190905194746-02993c407bfb // indirect github.com/xeipuuv/gojsonreference v0.0.0-20180127040603-bd5ef7bd5415 // indirect github.com/xeipuuv/gojsonschema v1.2.0 // indirect + github.com/xi2/xz v0.0.0-20171230120015-48954b6210f8 // indirect github.com/yashtewari/glob-intersection v0.1.0 // indirect github.com/yuin/goldmark v1.4.12 // indirect github.com/zclconf/go-cty v1.10.0 // indirect diff --git a/go.sum b/go.sum index 26d6733bc3776..9cf0052fdcf9a 100644 --- a/go.sum +++ b/go.sum @@ -899,6 +899,8 @@ github.com/hashicorp/go-multierror v1.0.0/go.mod h1:dHtQlpGsu+cZNNAkkCN/P3hoUDHh github.com/hashicorp/go-multierror v1.1.0/go.mod h1:spPvp8C1qA32ftKqdAHm4hHTbPw+vmowP0z+KUhOZdA= github.com/hashicorp/go-multierror v1.1.1 h1:H5DkEtf6CXdFp0N0Em5UCwQpXMWke8IA0+lD48awMYo= github.com/hashicorp/go-multierror v1.1.1/go.mod h1:iw975J/qwKPdAO1clOe2L8331t/9/fmwbPZ6JB6eMoM= +github.com/hashicorp/go-reap v0.0.0-20170704170343-bf58d8a43e7b h1:3GrpnZQBxcMj1gCXQLelfjCT1D5MPGTuGMKHVzSIH6A= +github.com/hashicorp/go-reap v0.0.0-20170704170343-bf58d8a43e7b/go.mod h1:qIFzeFcJU3OIFk/7JreWXcUjFmcCaeHTH9KoNyHYVCs= github.com/hashicorp/go-rootcerts v1.0.0/go.mod h1:K6zTfqpRlCUIjkwsN4Z+hiSfzSTQa6eBIzfwKfwNnHU= github.com/hashicorp/go-sockaddr v1.0.0/go.mod h1:7Xibr9yA9JjQq1JpNB2Vw7kxv8xerXegt+ozgdvDeDU= github.com/hashicorp/go-syslog v1.0.0/go.mod h1:qPfqrKkXGihmCqbJM2mZgkZGvKG1dFdvsLplgctolz4=