Thanks to visit codestin.com
Credit goes to github.com

Skip to content

fix: Add reaper to coder agent #2441

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
Jun 17, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions agent/reaper/doc.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
// Package reaper contains logic for reaping subprocesses. It is
// specifically used in the agent to avoid the accumulation of
// zombie processes.
package reaper
19 changes: 19 additions & 0 deletions agent/reaper/reaper_stub.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
//go:build !linux

package reaper

import "github.com/hashicorp/go-reap"

// IsChild returns true if we're the forked process. This build of the
// file only applies to non-Linux platforms (see the build constraint at
// the top), where no fork takes place, so it always returns false.
func IsChild() bool {
	return false
}

// IsInitProcess returns true if the current process's PID is 1. This
// non-Linux stub does not inspect the PID and always returns false.
func IsInitProcess() bool {
	return false
}

// ForkReap is a no-op in this non-Linux stub: it ignores the provided
// 'pids' channel, starts nothing, and always returns nil.
func ForkReap(pids reap.PidCh) error {
	return nil
}
66 changes: 66 additions & 0 deletions agent/reaper/reaper_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
//go:build linux

package reaper_test

import (
"os"
"os/exec"
"testing"
"time"

"github.com/hashicorp/go-reap"
"github.com/stretchr/testify/require"

"github.com/coder/coder/agent/reaper"
)

// TestReap exercises ForkReap: it starts two long-running processes,
// kills them, and expects two reaped PIDs to arrive on the shared
// channel, each matching one of the killed processes.
func TestReap(t *testing.T) {
	t.Parallel()

	// Forkexec'ing may have unintended consequences in CI, so the
	// reaper test is skipped whenever the CI variable is present.
	if _, inCI := os.LookupEnv("CI"); inCI {
		t.Skip("Detected CI, skipping reaper tests")
	}

	// Because we're forkexec'ing, these tests would otherwise run a
	// second time inside the child.
	if reaper.IsChild() {
		t.Skip("I'm a child!")
	}

	// OK checks that the reaper is successfully reaping exited
	// processes and passing their PIDs through the shared channel.
	t.Run("OK", func(t *testing.T) {
		reaped := make(reap.PidCh, 1)
		require.NoError(t, reaper.ForkReap(reaped))

		// Start every process before killing any of them.
		const procCount = 2
		procs := make([]*exec.Cmd, 0, procCount)
		for i := 0; i < procCount; i++ {
			proc := exec.Command("tail", "-f", "/dev/null")
			require.NoError(t, proc.Start())
			procs = append(procs, proc)
		}

		want := make([]int, 0, procCount)
		for _, proc := range procs {
			require.NoError(t, proc.Process.Kill())
			want = append(want, proc.Process.Pid)
		}

		// A single timer bounds the total wait for all PIDs.
		timeout := time.NewTimer(time.Second * 5)
		for range want {
			select {
			case <-timeout.C:
				t.Fatalf("Timed out waiting for process")
			case pid := <-reaped:
				require.Contains(t, want, pid)
			}
		}
	})
}
79 changes: 79 additions & 0 deletions agent/reaper/reaper_unix.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
//go:build linux

package reaper

import (
"fmt"
"os"
"syscall"

"github.com/hashicorp/go-reap"
"golang.org/x/xerrors"
)

// agentEnvMark is a simple environment variable that we use as a marker
// to indicate that the process is a child as opposed to the reaper.
// Since we are forkexec'ing we need to be able to differentiate between
// the two to avoid fork bombing ourselves.
const agentEnvMark = "CODER_DO_NOT_REAP"

// IsChild reports whether this process is the forked child, i.e. whether
// the agentEnvMark environment variable is set to a non-empty value.
func IsChild() bool {
	marker := os.Getenv(agentEnvMark)
	return len(marker) > 0
}

// IsInitProcess reports whether the current process is running as PID 1.
func IsInitProcess() bool {
	const initPID = 1
	return os.Getpid() == initPID
}

// ForkReap spawns a goroutine that reaps children. In order to avoid
// complications with spawning `exec.Commands` in the same process that
// is reaping, we forkexec a child process. This prevents a race between
// the reaper and an exec.Command waiting for its process to complete.
// The provided 'pids' channel may be nil if the caller does not care about the
// reaped children PIDs.
func ForkReap(pids reap.PidCh) error {
// Check if the process is the parent or the child.
// If it's the child we want to skip attempting to reap.
if IsChild() {
return nil
}

go reap.ReapChildren(pids, nil, nil, nil)

args := os.Args
// This is simply done to help identify the real agent process
// when viewing in something like 'ps'.
args = append(args, "#Agent")

pwd, err := os.Getwd()
if err != nil {
return xerrors.Errorf("get wd: %w", err)
}

pattrs := &syscall.ProcAttr{
Dir: pwd,
// Add our marker for identifying the child process.
Env: append(os.Environ(), fmt.Sprintf("%s=true", agentEnvMark)),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we filter this out from sub-process envs (i.e. when we launch shells)? It probably doesn't matter for correctness, more of a cleanliness thing.

We could also add CODER_AGENT_PID as part of the sub-process envs for programmatic use (e.g. to enable pprof, kill -USR1 $CODER_AGENT_PID). We'd need to do that always though, not only when forking.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree that it's certainly cleaner to filter it out. The only reason I didn't is because I did not want implementation details of the reaper package to leak outside. In other words, if you're developing the agent I don't want people to have to think about using a special function that filters out arcane environment variables that you don't understand.

I think if it becomes an issue we devote some time to figuring out something unobtrusive.

Copy link
Collaborator Author

@sreya sreya Jun 17, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I already merged this but I think a cleaner solution is to have a --noreap flag that we append to the forkexec cmd. Then we don't pollute subprocess envs or leak to the world whats happening under the hood. What do you think @mafredri

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That makes sense and I agree that's a clean solution @sreya. 👍🏻

Sys: &syscall.SysProcAttr{
Setsid: true,
},
Files: []uintptr{
uintptr(syscall.Stdin),
uintptr(syscall.Stdout),
uintptr(syscall.Stderr),
},
}

//#nosec G204
pid, _ := syscall.ForkExec(args[0], args, pattrs)

var wstatus syscall.WaitStatus
_, err = syscall.Wait4(pid, &wstatus, 0, nil)
for xerrors.Is(err, syscall.EINTR) {
_, err = syscall.Wait4(pid, &wstatus, 0, nil)
}

return nil
}
19 changes: 19 additions & 0 deletions cli/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"net/url"
"os"
"path/filepath"
"runtime"
"time"

"cloud.google.com/go/compute/metadata"
Expand All @@ -17,6 +18,7 @@ import (
"cdr.dev/slog/sloggers/sloghuman"

"github.com/coder/coder/agent"
"github.com/coder/coder/agent/reaper"
"github.com/coder/coder/cli/cliflag"
"github.com/coder/coder/codersdk"
"github.com/coder/retry"
Expand Down Expand Up @@ -50,6 +52,23 @@ func workspaceAgent() *cobra.Command {
}
defer logWriter.Close()
logger := slog.Make(sloghuman.Sink(cmd.ErrOrStderr()), sloghuman.Sink(logWriter)).Leveled(slog.LevelDebug)

isLinux := runtime.GOOS == "linux"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Any reason to limit this to linux only? I suppose this should work on Darwin and other BSD flavors too?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It was causing things to fail when it ran on macOS. I don't think limiting it to Linux is going to be a big deal, because the reaper is primarily going to be used for Docker containers, and on macOS those run in a Linux VM anyway.


// Spawn a reaper so that we don't accumulate a ton
// of zombie processes.
if reaper.IsInitProcess() && !reaper.IsChild() && isLinux {
logger.Info(cmd.Context(), "spawning reaper process")
err := reaper.ForkReap(nil)
if err != nil {
logger.Error(cmd.Context(), "failed to reap", slog.Error(err))
return xerrors.Errorf("fork reap: %w", err)
}

logger.Info(cmd.Context(), "reaper process exiting")
return nil
}

client := codersdk.New(coderURL)

if pprofEnabled {
Expand Down
4 changes: 2 additions & 2 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ require (
github.com/golang-migrate/migrate/v4 v4.15.2
github.com/google/go-github/v43 v43.0.1-0.20220414155304-00e42332e405
github.com/google/uuid v1.3.0
github.com/hashicorp/go-reap v0.0.0-20170704170343-bf58d8a43e7b
github.com/hashicorp/go-version v1.5.0
github.com/hashicorp/hc-install v0.3.2
github.com/hashicorp/hcl/v2 v2.12.0
Expand Down Expand Up @@ -134,8 +135,6 @@ require (
storj.io/drpc v0.0.30
)

require github.com/xi2/xz v0.0.0-20171230120015-48954b6210f8 // indirect

require (
github.com/Azure/go-ansiterm v0.0.0-20210617225240-d185dfc1b5a1 // indirect
github.com/Microsoft/go-winio v0.5.2 // indirect
Expand Down Expand Up @@ -236,6 +235,7 @@ require (
github.com/xeipuuv/gojsonpointer v0.0.0-20190905194746-02993c407bfb // indirect
github.com/xeipuuv/gojsonreference v0.0.0-20180127040603-bd5ef7bd5415 // indirect
github.com/xeipuuv/gojsonschema v1.2.0 // indirect
github.com/xi2/xz v0.0.0-20171230120015-48954b6210f8 // indirect
github.com/yashtewari/glob-intersection v0.1.0 // indirect
github.com/yuin/goldmark v1.4.12 // indirect
github.com/zclconf/go-cty v1.10.0 // indirect
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -899,6 +899,8 @@ github.com/hashicorp/go-multierror v1.0.0/go.mod h1:dHtQlpGsu+cZNNAkkCN/P3hoUDHh
github.com/hashicorp/go-multierror v1.1.0/go.mod h1:spPvp8C1qA32ftKqdAHm4hHTbPw+vmowP0z+KUhOZdA=
github.com/hashicorp/go-multierror v1.1.1 h1:H5DkEtf6CXdFp0N0Em5UCwQpXMWke8IA0+lD48awMYo=
github.com/hashicorp/go-multierror v1.1.1/go.mod h1:iw975J/qwKPdAO1clOe2L8331t/9/fmwbPZ6JB6eMoM=
github.com/hashicorp/go-reap v0.0.0-20170704170343-bf58d8a43e7b h1:3GrpnZQBxcMj1gCXQLelfjCT1D5MPGTuGMKHVzSIH6A=
github.com/hashicorp/go-reap v0.0.0-20170704170343-bf58d8a43e7b/go.mod h1:qIFzeFcJU3OIFk/7JreWXcUjFmcCaeHTH9KoNyHYVCs=
github.com/hashicorp/go-rootcerts v1.0.0/go.mod h1:K6zTfqpRlCUIjkwsN4Z+hiSfzSTQa6eBIzfwKfwNnHU=
github.com/hashicorp/go-sockaddr v1.0.0/go.mod h1:7Xibr9yA9JjQq1JpNB2Vw7kxv8xerXegt+ozgdvDeDU=
github.com/hashicorp/go-syslog v1.0.0/go.mod h1:qPfqrKkXGihmCqbJM2mZgkZGvKG1dFdvsLplgctolz4=
Expand Down