From 4bdcabb50dcef955fd863576fbde1788cee3ad02 Mon Sep 17 00:00:00 2001 From: Peter Hunt Date: Mon, 23 Aug 2021 14:02:41 -0400 Subject: [PATCH] internal/process: add functionality to clean up zombie children Now that exec sync requests are run by conmon, there are more processes in the mix and more possibility for zombies. This commit adds a zombie monitor to the defunct process metrics collection flow. It is a little clunky, but it would be weird to have two different /proc parsers for very similar uses. Signed-off-by: Peter Hunt --- internal/process/defunct_processes.go | 55 +++++++++++++------ internal/process/defunct_processes_test.go | 20 ++++--- internal/process/zombie_monitor.go | 53 ++++++++++++++++++ internal/process/zombie_monitor_defaults.go | 10 ++++ internal/process/zombie_monitor_test.go | 38 +++++++++++++ .../process/zombie_monitor_test_inject.go | 13 +++++ server/metrics/metrics.go | 8 +-- server/server.go | 6 ++ 8 files changed, 174 insertions(+), 29 deletions(-) create mode 100644 internal/process/zombie_monitor.go create mode 100644 internal/process/zombie_monitor_defaults.go create mode 100644 internal/process/zombie_monitor_test.go create mode 100644 internal/process/zombie_monitor_test_inject.go diff --git a/internal/process/defunct_processes.go b/internal/process/defunct_processes.go index d8873206eb1..c6598cff765 100644 --- a/internal/process/defunct_processes.go +++ b/internal/process/defunct_processes.go @@ -16,50 +16,60 @@ const ProcessFS = "/proc" // Stat represents status information of a process from /proc/[pid]/stat. type Stat struct { + // Pid is the PID of the process + Pid int + // Comm is the command name (usually the executable filename). Comm string // State is the state of the process. State string + + // PPid is the parent PID of the process + PPid int } -// DefunctProcesses returns the number of zombie processes in the node. 
-func DefunctProcesses() (defunctCount uint, retErr error) { - return DefunctProcessesForPath(ProcessFS) +// ParseDefunctProcesses returns the number of defunct processes on the node, +// as well as the PIDs of the defunct children of the currently running process. +func ParseDefunctProcesses() (defunctCount uint, defunctChildren []int, retErr error) { + return ParseDefunctProcessesForPathAndParent(ProcessFS, os.Getpid()) } -// DefunctProcessesForPath retrieves the number of zombie processes from -// a specific process filesystem. -func DefunctProcessesForPath(path string) (defunctCount uint, retErr error) { +// ParseDefunctProcessesForPathAndParent retrieves the number of zombie processes from +// a specific process filesystem, as well as the PIDs of the defunct children of a given parent. +func ParseDefunctProcessesForPathAndParent(path string, parent int) (defunctCount uint, defunctChildren []int, retErr error) { directories, err := os.Open(path) if err != nil { - return 0, err + return 0, defunctChildren, err } defer directories.Close() names, err := directories.Readdirnames(-1) if err != nil { - return 0, err + return 0, defunctChildren, err } for _, name := range names { // Processes have numeric names. If the name cannot // be parsed to an int, it is not a process name. 
- if _, err := strconv.ParseInt(name, 10, 0); err != nil { + pid, err := strconv.ParseInt(name, 10, 0) + if err != nil { continue } stat, err := processStats(path, name) if err != nil { - logrus.Debugf("Failed to get the status of process with PID %s: %v", name, err) continue } if stat.State == "Z" { - logrus.Warnf("Found defunct process with PID %s (%s)", name, stat.Comm) defunctCount++ + logrus.Warnf("Found defunct process with PID %s (%s)", name, stat.Comm) + if stat.PPid == parent { + defunctChildren = append(defunctChildren, int(pid)) + } } } - return defunctCount, nil + return defunctCount, defunctChildren, nil } // processStats returns status information of a process as defined in /proc/[pid]/stat @@ -73,21 +83,34 @@ func processStats(fsPath, pid string) (*Stat, error) { // /proc/[PID]/stat format is described in proc(5). The second field is process name, // enclosed in parentheses, and it can contain parentheses inside. No other fields // can have parentheses, so look for the last ')'. - i := strings.LastIndexByte(data, ')') - if i <= 2 || i >= len(data)-1 { + commEnd := strings.LastIndexByte(data, ')') + if commEnd <= 2 || commEnd >= len(data)-1 { return nil, errors.Errorf("invalid stat data (no comm): %q", data) } - parts := strings.SplitN(data[:i], " (", 2) + parts := strings.SplitN(data[:commEnd], " (", 2) if len(parts) != 2 { return nil, errors.Errorf("invalid stat data (no comm): %q", data) } + stateIdx := commEnd + 2 + + // the fourth field is PPid, and we can start looking after the space after State + ppidBegin := stateIdx + 2 + ppidEnd := strings.IndexByte(data[ppidBegin:], ' ') + + ppid, err := strconv.ParseInt(data[ppidBegin:ppidBegin+ppidEnd], 10, 0) + if err != nil { + return nil, errors.Errorf("invalid stat data (invalid ppid): %q", data[stateIdx+2:ppidEnd]) + } + return &Stat{ // The command name is field 2. Comm: parts[1], // The state is field 3, which is the first two fields and a space after. 
- State: string(data[i+2]), + State: string(data[stateIdx]), + + PPid: int(ppid), }, nil } diff --git a/internal/process/defunct_processes_test.go b/internal/process/defunct_processes_test.go index 064c55cf0aa..a68ecbf7ccd 100644 --- a/internal/process/defunct_processes_test.go +++ b/internal/process/defunct_processes_test.go @@ -11,28 +11,34 @@ import ( // The actual test suite var _ = t.Describe("Process", func() { - t.Describe("DefunctProcessesForPath", func() { + t.Describe("ParseDefunctProcessesForPathAndParent", func() { Context("Should succeed", func() { It("when given a valid path name and there are defunct processes", func() { - defunctCount, err := process.DefunctProcessesForPath("./testing/proc_success_1") + defunctCount, _, err := process.ParseDefunctProcessesForPathAndParent("./testing/proc_success_1", 0) Expect(err).To(BeNil()) Expect(defunctCount).To(Equal(uint(7))) }) + It("to get children when given a valid path name and there are defunct processes", func() { + _, children, err := process.ParseDefunctProcessesForPathAndParent("./testing/proc_success_1", 1) + + Expect(err).To(BeNil()) + Expect(len(children)).To(Equal(2)) + }) It("when given a valid path name but there are no defunct processes", func() { - defunctCount, err := process.DefunctProcessesForPath("./testing/proc_success_2") + defunctCount, _, err := process.ParseDefunctProcessesForPathAndParent("./testing/proc_success_2", 0) Expect(err).To(BeNil()) Expect(defunctCount).To(Equal(uint(0))) }) It("when given a valid path name but there are no processes", func() { - defunctCount, err := process.DefunctProcessesForPath("./testing/proc_success_3") + defunctCount, _, err := process.ParseDefunctProcessesForPathAndParent("./testing/proc_success_3", 0) Expect(err).To(BeNil()) Expect(defunctCount).To(Equal(uint(0))) }) It("when given a valid path name but there are no directories", func() { - defunctCount, err := process.DefunctProcessesForPath("./testing/proc_success_4") + defunctCount, _, err := 
process.ParseDefunctProcessesForPathAndParent("./testing/proc_success_4", 0) Expect(err).To(BeNil()) Expect(defunctCount).To(Equal(uint(0))) @@ -40,14 +46,14 @@ var _ = t.Describe("Process", func() { }) Context("Should fail", func() { It("when given an invalid path name", func() { - defunctCount, err := process.DefunctProcessesForPath("./test/proc") + defunctCount, _, err := process.ParseDefunctProcessesForPathAndParent("./test/proc", 0) formattedErr := fmt.Sprintf("%v", err) Expect(formattedErr).To(Equal("open ./test/proc: no such file or directory")) Expect(defunctCount).To(Equal(uint(0))) }) It("when the given path name does not belong to a directory", func() { - defunctCount, err := process.DefunctProcessesForPath("./testing/proc_fail") + defunctCount, _, err := process.ParseDefunctProcessesForPathAndParent("./testing/proc_fail", 0) formattedErr := fmt.Sprintf("%v", err) Expect(formattedErr).To(Equal("readdirent ./testing/proc_fail: not a directory")) diff --git a/internal/process/zombie_monitor.go b/internal/process/zombie_monitor.go new file mode 100644 index 00000000000..d09bdab2057 --- /dev/null +++ b/internal/process/zombie_monitor.go @@ -0,0 +1,53 @@ +package process + +import ( + "syscall" + "time" + + "github.com/sirupsen/logrus" +) + +// ZombieMonitor is a structure for watching and cleaning up zombies on the node. +// It is responsible for cleaning up zombies that are children of the currently running process. +// It does so by occasionally polling for the zombie processes. +// If any zombies are found, there is a delay between when they're identified and when they're cleaned. +// This is to ensure ZombieMonitor doesn't interfere with the go runtime's own child management. +type ZombieMonitor struct { + closeChan chan struct{} +} + +// NewZombieMonitor creates and starts the zombie monitor. 
+func NewZombieMonitor() *ZombieMonitor { + monitor := &ZombieMonitor{ + closeChan: make(chan struct{}, 1), + } + go monitor.Start() + return monitor +} + +// Shutdown instructs the zombie monitor to stop listening and exit. +func (zm *ZombieMonitor) Shutdown() { + zm.closeChan <- struct{}{} +} + +// Start runs the zombie monitor loop. It periodically looks for defunct +// children of the current process and reaps them after a delay. +func (zm *ZombieMonitor) Start() { + for { + _, zombieChildren, err := ParseDefunctProcesses() + if err != nil { + logrus.Warnf("Failed to get defunct process information: %v", err) + } + select { + case <-zm.closeChan: + // Since the process will soon shut down, and its children will be reparented, there is no need to delay the shutdown to clean up. + return + case <-time.After(defaultZombieChildReapPeriod): + } + for _, child := range zombieChildren { + if _, err := syscall.Wait4(child, nil, syscall.WNOHANG, nil); err != nil { + logrus.Errorf("Failed to reap child process %d: %v", child, err) + } + } + } +} diff --git a/internal/process/zombie_monitor_defaults.go b/internal/process/zombie_monitor_defaults.go new file mode 100644 index 00000000000..f07da2a187a --- /dev/null +++ b/internal/process/zombie_monitor_defaults.go @@ -0,0 +1,10 @@ +//go:build !test +// +build !test + +package process + +import "time" + +// defaultZombieChildReapPeriod is the period +// the zombie monitor will reap defunct children +var defaultZombieChildReapPeriod = time.Minute * 5 diff --git a/internal/process/zombie_monitor_test.go b/internal/process/zombie_monitor_test.go new file mode 100644 index 00000000000..9e4aad41364 --- /dev/null +++ b/internal/process/zombie_monitor_test.go @@ -0,0 +1,38 @@ +package process_test + +import ( + "os/exec" + "time" + + "github.com/cri-o/cri-o/internal/process" + . "github.com/onsi/ginkgo" + . 
"github.com/onsi/gomega" +) + +// The actual test suite +var _ = t.Describe("ZombieMonitor", func() { + It("should clean zombie", func() { + cmd := createZombie() + defer cmd.Wait() // nolint:errcheck + + monitor := process.NewZombieMonitor() + defer monitor.Shutdown() + + Eventually(func() int { + _, defunctChildren, err := process.ParseDefunctProcesses() + Expect(err).To(BeNil()) + return len(defunctChildren) + }, time.Second*10, time.Second).Should(Equal(0)) + }) +}) + +func createZombie() *exec.Cmd { + cmd := exec.Command("true") + err := cmd.Start() + Expect(err).To(BeNil()) + + _, defunctChildren, err := process.ParseDefunctProcesses() + Expect(err).To(BeNil()) + Expect(len(defunctChildren)).To(Equal(1)) + return cmd +} diff --git a/internal/process/zombie_monitor_test_inject.go b/internal/process/zombie_monitor_test_inject.go new file mode 100644 index 00000000000..e829e2d81fd --- /dev/null +++ b/internal/process/zombie_monitor_test_inject.go @@ -0,0 +1,13 @@ +//go:build test +// +build test + +// All *_inject.go files are meant to be used by tests only. Purpose of this +// files is to provide a way to inject mocked data into the current setup. 
+ +package process + +import "time" + +// defaultZombieChildReapPeriod reduces the time waited to reap the children +// for testing purposes +var defaultZombieChildReapPeriod = time.Second * 5 diff --git a/server/metrics/metrics.go b/server/metrics/metrics.go index bd2d7330833..ff854771177 100644 --- a/server/metrics/metrics.go +++ b/server/metrics/metrics.go @@ -176,12 +176,8 @@ func New(config *libconfig.MetricsConfig) *Metrics { Help: "Total number of defunct processes in the node", }, func() float64 { - total, err := process.DefunctProcesses() - if err == nil { - return float64(total) - } - logrus.Warn(err) - return 0 + count, _, _ := process.ParseDefunctProcesses() // nolint:errcheck + return float64(count) }, ), } diff --git a/server/server.go b/server/server.go index b74c1524833..6386f16ed35 100644 --- a/server/server.go +++ b/server/server.go @@ -24,6 +24,7 @@ import ( "github.com/cri-o/cri-o/internal/lib/sandbox" "github.com/cri-o/cri-o/internal/log" "github.com/cri-o/cri-o/internal/oci" + "github.com/cri-o/cri-o/internal/process" "github.com/cri-o/cri-o/internal/resourcestore" "github.com/cri-o/cri-o/internal/runtimehandlerhooks" "github.com/cri-o/cri-o/internal/storage" @@ -74,6 +75,8 @@ type Server struct { pullOperationsLock sync.Mutex resourceStore *resourcestore.ResourceStore + + zombieMonitor *process.ZombieMonitor } // pullArguments are used to identify a pullOperation via an input image name and @@ -297,6 +300,7 @@ func (s *Server) restore(ctx context.Context) []string { // Shutdown attempts to shut down the server's storage cleanly func (s *Server) Shutdown(ctx context.Context) error { s.resourceStore.Close() + s.zombieMonitor.Shutdown() if err := s.ContainerServer.Shutdown(); err != nil { return err @@ -507,6 +511,8 @@ func New( logrus.Debug("Metrics are disabled") } + s.zombieMonitor = process.NewZombieMonitor() + return s, nil }