-
Notifications
You must be signed in to change notification settings - Fork 1.1k
internal/process: add functionality to clean up zombie children #5260
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Closed
Closed
Changes from all commits
Commits
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -16,50 +16,60 @@ const ProcessFS = "/proc" | |
|
|
||
| // Stat represents status information of a process from /proc/[pid]/stat. | ||
| type Stat struct { | ||
| // Pid is the PID of the process. | |
| Pid int | ||
|
|
||
| // Comm is the command name (usually the executable filename). | ||
| Comm string | ||
|
|
||
| // State is the state of the process. | ||
| State string | ||
|
|
||
| // PPid is the parent PID of the process. | |
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. ditto |
||
| PPid int | ||
| } | ||
|
|
||
| // DefunctProcesses returns the number of zombie processes in the node. | ||
| func DefunctProcesses() (defunctCount uint, retErr error) { | ||
| return DefunctProcessesForPath(ProcessFS) | ||
| // ParseDefunctProcesses returns the number of defunct processes on the node, | ||
| // as well as the PIDs of the defunct children of the currently running process. | |
| func ParseDefunctProcesses() (defunctCount uint, defunctChildren []int, retErr error) { | ||
| return ParseDefunctProcessesForPathAndParent(ProcessFS, os.Getpid()) | ||
| } | ||
|
|
||
| // DefunctProcessesForPath retrieves the number of zombie processes from | ||
| // a specific process filesystem. | ||
| func DefunctProcessesForPath(path string) (defunctCount uint, retErr error) { | ||
| // ParseDefunctProcessesForPathAndParent retrieves the number of zombie processes from | |
kolyshkin marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| // a specific process filesystem, as well as the PIDs of the defunct children of a given parent. | |
| func ParseDefunctProcessesForPathAndParent(path string, parent int) (defunctCount uint, defunctChildren []int, retErr error) { | ||
| directories, err := os.Open(path) | ||
| if err != nil { | ||
| return 0, err | ||
| return 0, defunctChildren, err | ||
kolyshkin marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| } | ||
| defer directories.Close() | ||
|
|
||
| names, err := directories.Readdirnames(-1) | ||
| if err != nil { | ||
| return 0, err | ||
| return 0, defunctChildren, err | ||
| } | ||
|
|
||
| for _, name := range names { | ||
| // Processes have numeric names. If the name cannot | ||
| // be parsed to an int, it is not a process name. | ||
| if _, err := strconv.ParseInt(name, 10, 0); err != nil { | ||
| pid, err := strconv.ParseInt(name, 10, 0) | ||
| if err != nil { | ||
| continue | ||
| } | ||
|
|
||
| stat, err := processStats(path, name) | ||
| if err != nil { | ||
| logrus.Debugf("Failed to get the status of process with PID %s: %v", name, err) | ||
| continue | ||
| } | ||
| if stat.State == "Z" { | ||
| logrus.Warnf("Found defunct process with PID %s (%s)", name, stat.Comm) | ||
| defunctCount++ | ||
| logrus.Warnf("Found defunct process with PID %s (%s)", name, stat.Comm) | ||
| if stat.PPid == parent { | ||
| defunctChildren = append(defunctChildren, int(pid)) | ||
| } | ||
| } | ||
| } | ||
| return defunctCount, nil | ||
| return defunctCount, defunctChildren, nil | ||
| } | ||
|
|
||
| // processStats returns status information of a process as defined in /proc/[pid]/stat | ||
|
|
@@ -73,21 +83,34 @@ func processStats(fsPath, pid string) (*Stat, error) { | |
| // /proc/[PID]/stat format is described in proc(5). The second field is process name, | ||
| // enclosed in parentheses, and it can contain parentheses inside. No other fields | ||
| // can have parentheses, so look for the last ')'. | ||
| i := strings.LastIndexByte(data, ')') | ||
| if i <= 2 || i >= len(data)-1 { | ||
| commEnd := strings.LastIndexByte(data, ')') | ||
| if commEnd <= 2 || commEnd >= len(data)-1 { | ||
| return nil, errors.Errorf("invalid stat data (no comm): %q", data) | ||
| } | ||
|
|
||
| parts := strings.SplitN(data[:i], " (", 2) | ||
| parts := strings.SplitN(data[:commEnd], " (", 2) | ||
| if len(parts) != 2 { | ||
| return nil, errors.Errorf("invalid stat data (no comm): %q", data) | ||
| } | ||
|
|
||
| stateIdx := commEnd + 2 | ||
|
|
||
| // the fourth field is PPid, and we can start looking after the space after State | ||
| ppidBegin := stateIdx + 2 | ||
| ppidEnd := strings.IndexByte(data[ppidBegin:], ' ') | ||
|
|
||
| ppid, err := strconv.ParseInt(data[ppidBegin:ppidBegin+ppidEnd], 10, 0) | ||
| if err != nil { | ||
| return nil, errors.Errorf("invalid stat data (invalid ppid): %q", data[stateIdx+2:ppidEnd]) | ||
| } | ||
|
|
||
| return &Stat{ | ||
| // The command name is field 2. | ||
| Comm: parts[1], | ||
|
|
||
| // The state is field 3, which is the first two fields and a space after. | ||
| State: string(data[i+2]), | ||
| State: string(data[stateIdx]), | ||
|
|
||
| PPid: int(ppid), | ||
| }, nil | ||
| } | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,53 @@ | ||
| package process | ||
|
|
||
| import ( | ||
| "syscall" | ||
| "time" | ||
|
|
||
| "github.com/sirupsen/logrus" | ||
| ) | ||
|
|
||
| // ZombieMonitor is a structure for watching and cleaning up zombies on the node. | ||
| // It is responsible for cleaning up zombies that are children of the currently running process. | ||
| // It does so by occasionally polling for the zombie processes. | ||
| // If any zombies are found, there is a delay between when they're identified and when they're cleaned. | ||
| // This is to ensure ZombieMonitor doesn't interfere with the go runtime's own child management. | ||
| type ZombieMonitor struct { | ||
| closeChan chan struct{} | ||
| } | ||
|
|
||
| // NewZombieMonitor creates and starts the zombie monitor. | ||
| func NewZombieMonitor() *ZombieMonitor { | ||
| monitor := &ZombieMonitor{ | ||
| closeChan: make(chan struct{}, 1), | ||
| } | ||
| go monitor.Start() | ||
| return monitor | ||
| } | ||
|
|
||
| // Shutdown instructs the zombie monitor to stop listening and exit. | ||
| func (zm *ZombieMonitor) Shutdown() { | ||
| zm.closeChan <- struct{}{} | ||
| } | ||
|
|
||
| // Start runs the zombie monitor loop. On each iteration it looks up the defunct children | |
| // of the current process, waits for the reap period, and then reaps them, until Shutdown is called. | |
| func (zm *ZombieMonitor) Start() { | ||
| for { | ||
| _, zombieChildren, err := ParseDefunctProcesses() | ||
| if err != nil { | ||
| logrus.Warnf("Failed to get defunct process information: %v", err) | ||
| } | ||
| select { | ||
| case <-zm.closeChan: | ||
| // Since the process will soon shutdown, and its children will be reparented, no need to delay the shutdown to cleanup. | ||
| return | ||
| case <-time.After(defaultZombieChildReapPeriod): | ||
| } | ||
| for _, child := range zombieChildren { | ||
| if _, err := syscall.Wait4(child, nil, syscall.WNOHANG, nil); err != nil { | ||
| logrus.Errorf("Failed to reap child process %d: %v", child, err) | ||
| } | ||
| } | ||
| } | ||
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,10 @@ | ||
| //go:build !test | ||
| // +build !test | ||
|
|
||
| package process | ||
|
|
||
| import "time" | ||
|
|
||
| // defaultZombieChildReapPeriod is the interval at which | |
| // the zombie monitor reaps defunct children. | |
| var defaultZombieChildReapPeriod = time.Minute * 5 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,38 @@ | ||
| package process_test | ||
|
|
||
| import ( | ||
| "os/exec" | ||
| "time" | ||
|
|
||
| "github.com/cri-o/cri-o/internal/process" | ||
| . "github.com/onsi/ginkgo" | ||
| . "github.com/onsi/gomega" | ||
| ) | ||
|
|
||
| // The actual test suite | ||
| var _ = t.Describe("ZombieMonitor", func() { | ||
| It("should clean zombie", func() { | ||
| cmd := createZombie() | ||
| defer cmd.Wait() // nolint:errcheck | ||
|
|
||
| monitor := process.NewZombieMonitor() | ||
| defer monitor.Shutdown() | ||
|
|
||
| Eventually(func() int { | ||
| _, defunctChildren, err := process.ParseDefunctProcesses() | ||
| Expect(err).To(BeNil()) | ||
| return len(defunctChildren) | ||
| }, time.Second*10, time.Second).Should(Equal(0)) | ||
| }) | ||
| }) | ||
|
|
||
| func createZombie() *exec.Cmd { | ||
| cmd := exec.Command("true") | ||
| err := cmd.Start() | ||
| Expect(err).To(BeNil()) | ||
|
|
||
| _, defunctChildren, err := process.ParseDefunctProcesses() | ||
| Expect(err).To(BeNil()) | ||
| Expect(len(defunctChildren)).To(Equal(1)) | ||
| return cmd | ||
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,13 @@ | ||
| //go:build test | ||
| // +build test | ||
|
|
||
| // All *_inject.go files are meant to be used by tests only. The purpose of these | |
| // files is to provide a way to inject mocked data into the current setup. | |
|
|
||
| package process | ||
|
|
||
| import "time" | ||
|
|
||
| // defaultZombieChildReapPeriod shortens the wait before defunct children are reaped, | |
| // for testing purposes. | |
| var defaultZombieChildReapPeriod = time.Second * 5 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nit: missing period at the end of sentence (feel free to ignore).