diff --git a/internal/process/defunct_processes.go b/internal/process/defunct_processes.go index d8873206eb1..c6598cff765 100644 --- a/internal/process/defunct_processes.go +++ b/internal/process/defunct_processes.go @@ -16,50 +16,60 @@ const ProcessFS = "/proc" // Stat represents status information of a process from /proc/[pid]/stat. type Stat struct { + // Pid is the PID of the process. NOTE(review): the Stat literal returned by processStats does not set it. + Pid int + // Comm is the command name (usually the executable filename). Comm string // State is the state of the process. State string + + // PPid is the parent PID of the process (field 4 of /proc/[pid]/stat). + PPid int } -// DefunctProcesses returns the number of zombie processes in the node. -func DefunctProcesses() (defunctCount uint, retErr error) { - return DefunctProcessesForPath(ProcessFS) +// ParseDefunctProcesses returns the number of defunct processes on the node, +// as well as the PIDs of the defunct children of the currently running process. +func ParseDefunctProcesses() (defunctCount uint, defunctChildren []int, retErr error) { + return ParseDefunctProcessesForPathAndParent(ProcessFS, os.Getpid()) } -// DefunctProcessesForPath retrieves the number of zombie processes from -// a specific process filesystem. -func DefunctProcessesForPath(path string) (defunctCount uint, retErr error) { +// ParseDefunctProcessesForPathAndParent retrieves the number of zombie processes from +// a specific process filesystem, as well as the PIDs of the zombies whose parent PID equals parent. +func ParseDefunctProcessesForPathAndParent(path string, parent int) (defunctCount uint, defunctChildren []int, retErr error) { directories, err := os.Open(path) if err != nil { - return 0, err + return 0, defunctChildren, err } defer directories.Close() names, err := directories.Readdirnames(-1) if err != nil { - return 0, err + return 0, defunctChildren, err } for _, name := range names { // Processes have numeric names. If the name cannot // be parsed to an int, it is not a process name. 
- if _, err := strconv.ParseInt(name, 10, 0); err != nil { + pid, err := strconv.ParseInt(name, 10, 0) + if err != nil { continue } stat, err := processStats(path, name) if err != nil { - logrus.Debugf("Failed to get the status of process with PID %s: %v", name, err) continue } if stat.State == "Z" { - logrus.Warnf("Found defunct process with PID %s (%s)", name, stat.Comm) defunctCount++ + logrus.Warnf("Found defunct process with PID %s (%s)", name, stat.Comm) + if stat.PPid == parent { + defunctChildren = append(defunctChildren, int(pid)) + } } } - return defunctCount, nil + return defunctCount, defunctChildren, nil } // processStats returns status information of a process as defined in /proc/[pid]/stat @@ -73,21 +83,34 @@ func processStats(fsPath, pid string) (*Stat, error) { // /proc/[PID]/stat format is described in proc(5). The second field is process name, // enclosed in parentheses, and it can contain parentheses inside. No other fields // can have parentheses, so look for the last ')'. - i := strings.LastIndexByte(data, ')') - if i <= 2 || i >= len(data)-1 { + commEnd := strings.LastIndexByte(data, ')') + if commEnd <= 2 || commEnd >= len(data)-1 { return nil, errors.Errorf("invalid stat data (no comm): %q", data) } - parts := strings.SplitN(data[:i], " (", 2) + parts := strings.SplitN(data[:commEnd], " (", 2) if len(parts) != 2 { return nil, errors.Errorf("invalid stat data (no comm): %q", data) } + stateIdx := commEnd + 2 + + // The fourth field is PPid; it begins right after the space that follows State. NOTE(review): ppidEnd is relative to ppidBegin and is -1 when no space follows, so the slices below can panic; the error message also uses it as an absolute index. + ppidBegin := stateIdx + 2 + ppidEnd := strings.IndexByte(data[ppidBegin:], ' ') + + ppid, err := strconv.ParseInt(data[ppidBegin:ppidBegin+ppidEnd], 10, 0) + if err != nil { + return nil, errors.Errorf("invalid stat data (invalid ppid): %q", data[stateIdx+2:ppidEnd]) + } + return &Stat{ // The command name is field 2. Comm: parts[1], // The state is field 3, which is the first two fields and a space after. 
- State: string(data[i+2]), + State: string(data[stateIdx]), + + PPid: int(ppid), }, nil } diff --git a/internal/process/defunct_processes_test.go b/internal/process/defunct_processes_test.go index 064c55cf0aa..a68ecbf7ccd 100644 --- a/internal/process/defunct_processes_test.go +++ b/internal/process/defunct_processes_test.go @@ -11,28 +11,34 @@ import ( // The actual test suite var _ = t.Describe("Process", func() { - t.Describe("DefunctProcessesForPath", func() { + t.Describe("ParseDefunctProcessesForPathAndParent", func() { Context("Should succeed", func() { It("when given a valid path name and there are defunct processes", func() { - defunctCount, err := process.DefunctProcessesForPath("./testing/proc_success_1") + defunctCount, _, err := process.ParseDefunctProcessesForPathAndParent("./testing/proc_success_1", 0) Expect(err).To(BeNil()) Expect(defunctCount).To(Equal(uint(7))) }) + It("to get children when given a valid path name and there are defunct processes", func() { + _, children, err := process.ParseDefunctProcessesForPathAndParent("./testing/proc_success_1", 1) + + Expect(err).To(BeNil()) + Expect(len(children)).To(Equal(2)) + }) It("when given a valid path name but there are no defunct processes", func() { - defunctCount, err := process.DefunctProcessesForPath("./testing/proc_success_2") + defunctCount, _, err := process.ParseDefunctProcessesForPathAndParent("./testing/proc_success_2", 0) Expect(err).To(BeNil()) Expect(defunctCount).To(Equal(uint(0))) }) It("when given a valid path name but there are no processes", func() { - defunctCount, err := process.DefunctProcessesForPath("./testing/proc_success_3") + defunctCount, _, err := process.ParseDefunctProcessesForPathAndParent("./testing/proc_success_3", 0) Expect(err).To(BeNil()) Expect(defunctCount).To(Equal(uint(0))) }) It("when given a valid path name but there are no directories", func() { - defunctCount, err := process.DefunctProcessesForPath("./testing/proc_success_4") + defunctCount, _, err := 
process.ParseDefunctProcessesForPathAndParent("./testing/proc_success_4", 0) Expect(err).To(BeNil()) Expect(defunctCount).To(Equal(uint(0))) @@ -40,14 +46,14 @@ var _ = t.Describe("Process", func() { }) Context("Should fail", func() { It("when given an invalid path name", func() { - defunctCount, err := process.DefunctProcessesForPath("./test/proc") + defunctCount, _, err := process.ParseDefunctProcessesForPathAndParent("./test/proc", 0) formattedErr := fmt.Sprintf("%v", err) Expect(formattedErr).To(Equal("open ./test/proc: no such file or directory")) Expect(defunctCount).To(Equal(uint(0))) }) It("when the given path name does not belong to a directory", func() { - defunctCount, err := process.DefunctProcessesForPath("./testing/proc_fail") + defunctCount, _, err := process.ParseDefunctProcessesForPathAndParent("./testing/proc_fail", 0) formattedErr := fmt.Sprintf("%v", err) Expect(formattedErr).To(Equal("readdirent ./testing/proc_fail: not a directory")) diff --git a/internal/process/zombie_monitor.go b/internal/process/zombie_monitor.go new file mode 100644 index 00000000000..d09bdab2057 --- /dev/null +++ b/internal/process/zombie_monitor.go @@ -0,0 +1,53 @@ +package process + +import ( + "syscall" + "time" + + "github.com/sirupsen/logrus" +) + +// ZombieMonitor is a structure for watching and cleaning up zombies on the node. +// It is responsible for cleaning up zombies that are children of the currently running process. +// It does so by periodically polling for zombie processes. +// If any zombies are found, there is a delay between when they're identified and when they're cleaned. +// This is to ensure ZombieMonitor doesn't interfere with the Go runtime's own child management. +type ZombieMonitor struct { + closeChan chan struct{} +} + +// NewZombieMonitor creates the zombie monitor and starts its loop in a new goroutine. 
+func NewZombieMonitor() *ZombieMonitor { + monitor := &ZombieMonitor{ + closeChan: make(chan struct{}, 1), + } + go monitor.Start() + return monitor +} + +// Shutdown instructs the zombie monitor loop to exit by sending a value on closeChan. +func (zm *ZombieMonitor) Shutdown() { + zm.closeChan <- struct{}{} +} + +// Start runs the monitor loop until Shutdown is called. Each pass lists the defunct children, +// waits defaultZombieChildReapPeriod, and then reaps those children with a non-blocking Wait4. +func (zm *ZombieMonitor) Start() { + for { + _, zombieChildren, err := ParseDefunctProcesses() + if err != nil { + logrus.Warnf("Failed to get defunct process information: %v", err) + } + select { + case <-zm.closeChan: + // Since the process will soon shut down, and its children will be reparented, there is no need to delay the shutdown to clean up. + return + case <-time.After(defaultZombieChildReapPeriod): + } + for _, child := range zombieChildren { + if _, err := syscall.Wait4(child, nil, syscall.WNOHANG, nil); err != nil { + logrus.Errorf("Failed to reap child process %d: %v", child, err) + } + } + } +} diff --git a/internal/process/zombie_monitor_defaults.go b/internal/process/zombie_monitor_defaults.go new file mode 100644 index 00000000000..f07da2a187a --- /dev/null +++ b/internal/process/zombie_monitor_defaults.go @@ -0,0 +1,10 @@ +//go:build !test +// +build !test + +package process + +import "time" + +// defaultZombieChildReapPeriod is the delay between the zombie monitor +// finding defunct children and reaping them. +var defaultZombieChildReapPeriod = time.Minute * 5 diff --git a/internal/process/zombie_monitor_test.go b/internal/process/zombie_monitor_test.go new file mode 100644 index 00000000000..9e4aad41364 --- /dev/null +++ b/internal/process/zombie_monitor_test.go @@ -0,0 +1,38 @@ +package process_test + +import ( + "os/exec" + "time" + + "github.com/cri-o/cri-o/internal/process" + . "github.com/onsi/ginkgo" + . 
"github.com/onsi/gomega" +) + +// The actual test suite +var _ = t.Describe("ZombieMonitor", func() { + It("should clean zombie", func() { + cmd := createZombie() + defer cmd.Wait() // nolint:errcheck + + monitor := process.NewZombieMonitor() + defer monitor.Shutdown() + + Eventually(func() int { + _, defunctChildren, err := process.ParseDefunctProcesses() + Expect(err).To(BeNil()) + return len(defunctChildren) + }, time.Second*10, time.Second).Should(Equal(0)) + }) +}) + +func createZombie() *exec.Cmd { + cmd := exec.Command("true") + err := cmd.Start() + Expect(err).To(BeNil()) + + _, defunctChildren, err := process.ParseDefunctProcesses() + Expect(err).To(BeNil()) + Expect(len(defunctChildren)).To(Equal(1)) + return cmd +} diff --git a/internal/process/zombie_monitor_test_inject.go b/internal/process/zombie_monitor_test_inject.go new file mode 100644 index 00000000000..e829e2d81fd --- /dev/null +++ b/internal/process/zombie_monitor_test_inject.go @@ -0,0 +1,13 @@ +//go:build test +// +build test + +// All *_inject.go files are meant to be used by tests only. The purpose of these +// files is to provide a way to inject mocked data into the current setup. 
+ +package process + +import "time" + +// defaultZombieChildReapPeriod is the delay before defunct children are reaped, +// shortened here for testing purposes. +var defaultZombieChildReapPeriod = time.Second * 5 diff --git a/server/metrics/metrics.go b/server/metrics/metrics.go index bd2d7330833..ff854771177 100644 --- a/server/metrics/metrics.go +++ b/server/metrics/metrics.go @@ -176,12 +176,8 @@ func New(config *libconfig.MetricsConfig) *Metrics { Help: "Total number of defunct processes in the node", }, func() float64 { - total, err := process.DefunctProcesses() - if err == nil { - return float64(total) - } - logrus.Warn(err) - return 0 + count, _, _ := process.ParseDefunctProcesses() // nolint:errcheck + return float64(count) }, ), } diff --git a/server/server.go b/server/server.go index b74c1524833..6386f16ed35 100644 --- a/server/server.go +++ b/server/server.go @@ -24,6 +24,7 @@ import ( "github.com/cri-o/cri-o/internal/lib/sandbox" "github.com/cri-o/cri-o/internal/log" "github.com/cri-o/cri-o/internal/oci" + "github.com/cri-o/cri-o/internal/process" "github.com/cri-o/cri-o/internal/resourcestore" "github.com/cri-o/cri-o/internal/runtimehandlerhooks" "github.com/cri-o/cri-o/internal/storage" @@ -74,6 +75,8 @@ type Server struct { pullOperationsLock sync.Mutex resourceStore *resourcestore.ResourceStore + + zombieMonitor *process.ZombieMonitor } // pullArguments are used to identify a pullOperation via an input image name and @@ -297,6 +300,7 @@ func (s *Server) restore(ctx context.Context) []string { // Shutdown attempts to shut down the server's storage cleanly func (s *Server) Shutdown(ctx context.Context) error { s.resourceStore.Close() + s.zombieMonitor.Shutdown() if err := s.ContainerServer.Shutdown(); err != nil { return err @@ -507,6 +511,8 @@ func New( logrus.Debug("Metrics are disabled") } + s.zombieMonitor = process.NewZombieMonitor() + return s, nil }