Thanks to visit codestin.com
Credit goes to github.com

Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -456,7 +456,8 @@ mockgen: \
mock-ocicni-types \
mock-seccompociartifact-types \
mock-ociartifact-types \
mock-systemd
mock-systemd \
mock-cgmgr

.PHONY: mock-containereventserver
mock-containereventserver: ${MOCKGEN}
Expand Down Expand Up @@ -500,6 +501,13 @@ mock-oci: ${MOCKGEN}
-destination ${MOCK_PATH}/oci/oci.go \
github.com/cri-o/cri-o/internal/oci RuntimeImpl

# mock-cgmgr runs ${MOCKGEN} against the CgroupManager interface of
# github.com/cri-o/cri-o/internal/config/cgmgr and writes the result to
# ${MOCK_PATH}/config/cgmgr/cgmgr.go, declared as package cgmgr.
.PHONY: mock-cgmgr
mock-cgmgr: ${MOCKGEN}
${MOCKGEN} \
-package cgmgr \
-destination ${MOCK_PATH}/config/cgmgr/cgmgr.go \
github.com/cri-o/cri-o/internal/config/cgmgr CgroupManager

.PHONY: mock-image-types
mock-image-types: ${MOCKGEN}
${BUILD_BIN_PATH}/mockgen \
Expand Down
85 changes: 85 additions & 0 deletions internal/config/cgmgr/cgmgr_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,17 @@ type CgroupManager interface {
// It creates a new cgroup for that sandbox if it does not already exist.
// It returns the cgroup stats for that sandbox.
SandboxCgroupStats(sbParent, sbID string) (*CgroupStats, error)
// ExecCgroupManager returns the cgroup manager for the exec cgroup used to place exec processes.
// The cgroupPath parameter is the container's cgroup path from spec.Linux.CgroupsPath.
// This is only supported on cgroup v2.
ExecCgroupManager(cgroupPath string) (cgroups.Manager, error)
// PodAndContainerCgroupManagers returns the libcontainer cgroup managers for both the pod and container cgroups.
// The sbParent is the sandbox parent cgroup, and containerID is the container's ID.
// It returns:
// - podManager: the cgroup manager for the pod cgroup
// - containerManagers: a slice of cgroup managers for the container cgroup(s).
// This may include an extra manager if crun creates a sub-cgroup of the container.
PodAndContainerCgroupManagers(sbParent, containerID string) (podManager cgroups.Manager, containerManagers []cgroups.Manager, err error)
}

// New creates a new CgroupManager with defaults.
Expand Down Expand Up @@ -245,3 +256,77 @@ func removeSandboxCgroup(sbParent, containerCgroup string) error {
// containerCgroupPath returns the cgroup name used for the container with the
// given ID: CrioPrefix and the ID joined by a single dash.
func containerCgroupPath(id string) string {
	name := CrioPrefix + "-"
	name += id

	return name
}

// LibctrManager builds a libcontainer cgroup manager for a single cgroup.
//
// cgroup is the name of the cgroup and parent is the path of its parent.
// When systemd is true, only the last element of parent is kept (a parent that
// reduces to "." is replaced by "-.slice") and the systemd cgroup driver is
// requested; otherwise parent is passed through unchanged.
// The result is whatever manager.New returns for that configuration.
func LibctrManager(cgroup, parent string, systemd bool) (cgroups.Manager, error) {
	if systemd {
		if parent = filepath.Base(parent); parent == "." {
			// "-.slice" is the libcontainer shorthand for the root slice, see
			// https://github.com/opencontainers/runc/blob/9fffadae8/libcontainer/cgroups/systemd/common.go#L71
			parent = "-.slice"
		}
	}

	config := cgroups.Cgroup{
		Name:    cgroup,
		Parent:  parent,
		Systemd: systemd,
		// With the systemd driver libcontainer names scopes as
		// ScopePrefix-Name.scope. The prefix is ignored for slices and for
		// the cgroupfs driver.
		// See: https://github.com/opencontainers/runc/tree/main/libcontainer/cgroups/systemd/common.go:getUnitName
		ScopePrefix: CrioPrefix,
		// Device rules are explicitly skipped for managers created here.
		Resources: &cgroups.Resources{SkipDevices: true},
	}

	return manager.New(&config)
}

// crunContainerCgroupManager looks for the "container" sub-cgroup that crun
// creates below the cgroup CRI-O expects to be the container's, and returns a
// manager for it.
//
// crun manages the container from that child cgroup in order to respect
// systemd's single owner rule. When the child directory cannot be stat'ed
// (for any reason, not only because it is missing) the function reports
// "no sub-cgroup" by returning nil, nil.
func crunContainerCgroupManager(expectedContainerCgroup string) (cgroups.Manager, error) {
	// HACK: the child name is hardcoded. Reading /proc/$pid/cgroup would be
	// the alternative, but this has to keep working after the container has
	// exited and its process is gone.
	childCgroup := filepath.Join(expectedContainerCgroup, "container")

	// On cgroup v1 the cpuset hierarchy is probed; the choice of controller
	// is arbitrary. On v2 the root is used as is.
	root := CgroupMemoryPathV2
	if !node.CgroupIsV2() {
		root += "/cpuset"
	}

	// Strip the root first so it is not prepended twice.
	relative := strings.TrimPrefix(childCgroup, root)
	if _, statErr := os.Stat(filepath.Join(root, relative)); statErr != nil {
		return nil, nil
	}

	// The child exists, so this must be crun. The child is always handled with
	// the cgroupfs driver, whatever driver the container itself uses.
	name, parent := filepath.Base(childCgroup), filepath.Dir(childCgroup)

	return LibctrManager(name, parent, false)
}

// execCgroupManager returns the manager for the "exec" cgroup in which exec
// processes are placed.
//
// containerCgroupAbsPath is the absolute path of the container's cgroup,
// without the /sys/fs/cgroup prefix. The parent of the exec cgroup is chosen
// as follows:
//   - if crun's "container" child cgroup is detected, the exec cgroup lives
//     under that child;
//   - otherwise (including when the detection itself returns an error) it
//     lives directly under the container cgroup.
//
// The exec cgroup is always handled with the cgroupfs driver.
func execCgroupManager(containerCgroupAbsPath string) (cgroups.Manager, error) {
	crunMgr, probeErr := crunContainerCgroupManager(containerCgroupAbsPath)
	if probeErr != nil || crunMgr == nil {
		return LibctrManager("exec", containerCgroupAbsPath, false)
	}

	return LibctrManager("exec", filepath.Join(containerCgroupAbsPath, "container"), false)
}
55 changes: 53 additions & 2 deletions internal/config/cgmgr/cgroupfs_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
package cgmgr

import (
"errors"
"fmt"
"path"
"path/filepath"
Expand Down Expand Up @@ -79,7 +80,7 @@ func (m *CgroupfsManager) ContainerCgroupManager(sbParent, containerID string) (
return nil, err
}

cgMgr, err := libctrManager(filepath.Base(cgPath), filepath.Dir(cgPath), false)
cgMgr, err := LibctrManager(filepath.Base(cgPath), filepath.Dir(cgPath), false)
if err != nil {
return nil, err
}
Expand Down Expand Up @@ -151,7 +152,7 @@ func (m *CgroupfsManager) SandboxCgroupManager(sbParent, sbID string) (cgroups.M
return nil, err
}

cgMgr, err := libctrManager(filepath.Base(cgPath), filepath.Dir(cgPath), false)
cgMgr, err := LibctrManager(filepath.Base(cgPath), filepath.Dir(cgPath), false)
if err != nil {
return nil, err
}
Expand Down Expand Up @@ -266,3 +267,53 @@ func (m *CgroupfsManager) RemoveSandboxCgroup(sbParent, containerID string) erro
// https://github.com/opencontainers/runc/blob/fd5debf3aa/libcontainer/cgroups/fs/paths.go#L156
return removeSandboxCgroup(filepath.Join("/", sbParent), containerCgroupPath(containerID))
}

// PodAndContainerCgroupManagers returns libcontainer cgroup managers for the
// pod cgroup and for the container cgroup(s) of the given container.
//
// sbParent is the sandbox parent cgroup and containerID the container's ID.
// The pod cgroup is taken to be the directory containing the container's
// absolute cgroup path. The returned slice always starts with the manager of
// the container cgroup itself; a second entry is appended when crun's
// sub-cgroup of the container is found. Any error aborts the call and yields
// nil managers.
func (m *CgroupfsManager) PodAndContainerCgroupManagers(sbParent, containerID string) (podManager cgroups.Manager, containerManagers []cgroups.Manager, _ error) {
	ctrPath, err := m.ContainerCgroupAbsolutePath(sbParent, containerID)
	if err != nil {
		return nil, nil, err
	}

	podPath := filepath.Dir(ctrPath)

	podMgr, err := LibctrManager(filepath.Base(podPath), filepath.Dir(podPath), false)
	if err != nil {
		return nil, nil, err
	}

	ctrMgr, err := LibctrManager(filepath.Base(ctrPath), podPath, false)
	if err != nil {
		return nil, nil, err
	}

	managers := []cgroups.Manager{ctrMgr}

	// crun configures the cgroup in a child of the one CRI-O expects to be
	// the container's, so that child needs a manager of its own.
	crunMgr, err := crunContainerCgroupManager(ctrPath)
	if err != nil {
		return nil, nil, err
	}

	if crunMgr != nil {
		managers = append(managers, crunMgr)
	}

	return podMgr, managers, nil
}

// ExecCgroupManager returns the cgroup manager for the exec cgroup used to
// place exec processes.
//
// With the cgroupfs driver cgroupPath is handed to execCgroupManager
// unchanged. An error is returned when cgroupPath is empty or when the node is
// not running cgroup v2.
func (m *CgroupfsManager) ExecCgroupManager(cgroupPath string) (cgroups.Manager, error) {
	switch {
	case cgroupPath == "":
		return nil, errors.New("container cgroup path is empty")
	case !node.CgroupIsV2():
		return nil, errors.New("exec cgroup with CgroupFD is only supported on cgroup v2")
	}

	return execCgroupManager(cgroupPath)
}
30 changes: 0 additions & 30 deletions internal/config/cgmgr/stats_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,10 @@ package cgmgr

import (
"math"
"path/filepath"
"syscall"
"time"

"github.com/opencontainers/cgroups"
"github.com/opencontainers/cgroups/manager"

"github.com/cri-o/cri-o/internal/config/node"
)
Expand Down Expand Up @@ -91,34 +89,6 @@ func MemLimitGivenSystem(cgroupLimit uint64) uint64 {
return cgroupLimit
}

// libctrManager builds a libcontainer cgroup manager for the cgroup named
// cgroup below parent. When systemd is true, parent is reduced to its last
// path element ("." becomes "-.slice") and the systemd driver is requested.
func libctrManager(cgroup, parent string, systemd bool) (cgroups.Manager, error) {
	unitParent := parent
	if systemd {
		unitParent = filepath.Base(parent)
		// libcontainer shorthand for root
		// see https://github.com/opencontainers/runc/blob/9fffadae8/libcontainer/cgroups/systemd/common.go#L71
		if unitParent == "." {
			unitParent = "-.slice"
		}
	}

	// Device rules are explicitly skipped for managers created here.
	resources := &cgroups.Resources{SkipDevices: true}

	return manager.New(&cgroups.Cgroup{
		Name:      cgroup,
		Parent:    unitParent,
		Resources: resources,
		Systemd:   systemd,
		// With the systemd driver libcontainer names scopes as
		// ScopePrefix-Name.scope. The prefix is ignored for slices and for
		// the cgroupfs driver.
		// See: https://github.com/opencontainers/runc/tree/main/libcontainer/cgroups/systemd/common.go:getUnitName
		ScopePrefix: CrioPrefix,
	})
}

func libctrStatsToCgroupStats(stats *cgroups.Stats) *CgroupStats {
return &CgroupStats{
Memory: cgroupMemStats(&stats.MemoryStats),
Expand Down
75 changes: 73 additions & 2 deletions internal/config/cgmgr/systemd_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
package cgmgr

import (
"errors"
"fmt"
"path"
"path/filepath"
Expand Down Expand Up @@ -111,7 +112,7 @@ func (m *SystemdManager) ContainerCgroupManager(sbParent, containerID string) (c
return nil, err
}
// Due to a quirk of libcontainer's cgroup driver, cgroup name = containerID
cgMgr, err := libctrManager(containerID, filepath.Dir(cgPath), true)
cgMgr, err := LibctrManager(containerID, filepath.Dir(cgPath), true)
if err != nil {
return nil, err
}
Expand Down Expand Up @@ -262,7 +263,7 @@ func (m *SystemdManager) SandboxCgroupManager(sbParent, sbID string) (cgroups.Ma
return nil, err
}

cgMgr, err := libctrManager(filepath.Base(cgPath), filepath.Dir(cgPath), true)
cgMgr, err := LibctrManager(filepath.Base(cgPath), filepath.Dir(cgPath), true)
if err != nil {
return nil, err
}
Expand Down Expand Up @@ -365,3 +366,73 @@ func (m *SystemdManager) RemoveSandboxCgroup(sbParent, containerID string) error

return removeSandboxCgroup(expandedParent, containerCgroupPath(containerID))
}

// PodAndContainerCgroupManagers returns libcontainer cgroup managers for the
// pod cgroup and for the container cgroup(s) of the given container.
//
// sbParent is the sandbox parent cgroup and containerID the container's ID.
// The pod cgroup is taken to be the directory containing the container's
// absolute cgroup path. The returned slice always starts with the manager of
// the container cgroup itself; a second entry is appended when crun's
// sub-cgroup of the container is found. Any error aborts the call and yields
// nil managers.
func (m *SystemdManager) PodAndContainerCgroupManagers(sbParent, containerID string) (podManager cgroups.Manager, containerManagers []cgroups.Manager, _ error) {
	ctrPath, err := m.ContainerCgroupAbsolutePath(sbParent, containerID)
	if err != nil {
		return nil, nil, err
	}

	podPath := filepath.Dir(ctrPath)

	podMgr, err := LibctrManager(filepath.Base(podPath), filepath.Dir(podPath), true)
	if err != nil {
		return nil, nil, err
	}

	// The name has to be the bare container ID: passing the full unit name
	// would add the prefix/suffix a second time.
	ctrMgr, err := LibctrManager(containerID, podPath, true)
	if err != nil {
		return nil, nil, err
	}

	managers := []cgroups.Manager{ctrMgr}

	// crun configures the cgroup in a child of the one CRI-O expects to be
	// the container's, so that child needs a manager of its own.
	crunMgr, err := crunContainerCgroupManager(ctrPath)
	if err != nil {
		return nil, nil, err
	}

	if crunMgr != nil {
		managers = append(managers, crunMgr)
	}

	return podMgr, managers, nil
}

// ExecCgroupManager returns the cgroup manager for the exec cgroup used to
// place exec processes.
//
// With the systemd driver cgroupPath must have the form
// "slice:prefix:containerID". The slice is expanded with systemd.ExpandSlice
// and the container scope "<prefix>-<containerID>.scope" is appended to obtain
// the container cgroup path passed to execCgroupManager.
// An error is returned when cgroupPath is empty, when the node is not running
// cgroup v2, when cgroupPath does not have exactly three colon-separated
// fields, or when the slice cannot be expanded.
func (m *SystemdManager) ExecCgroupManager(cgroupPath string) (cgroups.Manager, error) {
	switch {
	case cgroupPath == "":
		return nil, errors.New("container cgroup path is empty")
	case !node.CgroupIsV2():
		return nil, errors.New("exec cgroup with CgroupFD is only supported on cgroup v2")
	}

	// Systemd format: slice:prefix:containerID
	fields := strings.Split(cgroupPath, ":")
	if len(fields) != 3 {
		return nil, fmt.Errorf("invalid systemd cgroup path format: %s (expected slice:prefix:containerID)", cgroupPath)
	}

	slice, prefix, containerID := fields[0], fields[1], fields[2]

	expandedSlice, err := systemd.ExpandSlice(slice)
	if err != nil {
		return nil, fmt.Errorf("failed to expand systemd slice %q: %w", slice, err)
	}

	// The container cgroup is a scope below the expanded slice:
	// <expanded-slice>/<prefix>-<containerID>.scope
	scopeName := prefix + "-" + containerID + ".scope"

	return execCgroupManager(filepath.Join(expandedSlice, scopeName))
}
14 changes: 14 additions & 0 deletions internal/oci/container.go
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,10 @@ type Container struct {
// To avoid race condition, it must be used with monitorProcessLock.
monitorProcess *os.Process
monitorProcessLock sync.Mutex
// execCgroupPath is the absolute path to the pre-created exec cgroup.
// When set, the exec process will spawn on this cgroup.
// If this is used, InfraCtrCPUSet will be ignored for the exec operation.
execCgroupPath string
}

func (c *Container) CRIAttributes() *types.ContainerAttributes {
Expand Down Expand Up @@ -921,6 +925,16 @@ func (c *Container) RuntimeUser() *types.ContainerUser {
return c.runtimeUser
}

// SetExecCgroupPath sets the pre-created exec cgroup path.
// The value is stored as given, without any validation; an empty string
// clears it.
// NOTE(review): no lock is taken around this write — confirm callers never
// invoke it concurrently with ExecCgroupPath.
func (c *Container) SetExecCgroupPath(path string) {
c.execCgroupPath = path
}

// ExecCgroupPath returns the pre-created exec cgroup path, or empty string if not set.
// The stored value is returned as is.
// NOTE(review): the field is read without a lock — confirm callers never
// invoke this concurrently with SetExecCgroupPath.
func (c *Container) ExecCgroupPath() string {
return c.execCgroupPath
}

// SetMonitorProcess loads the container monitor process from the ContainerMonitorProcess field.
// It doesn't return any error so that we can continue to load the container even if the monitor process
// is not found.
Expand Down
5 changes: 5 additions & 0 deletions internal/oci/oci_unsupported.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ package oci
import (
"context"
"os"
"os/exec"
"syscall"

types "k8s.io/cri-api/pkg/apis/runtime/v1"
Expand Down Expand Up @@ -40,3 +41,7 @@ func (c *Container) SetSeccompProfilePath(pp string) {
func (c *Container) SeccompProfilePath() string {
return ""
}

// setSysProcAttr is a no-op on non-Linux platforms.
// Both arguments are discarded and the command is left untouched.
func setSysProcAttr(_ *exec.Cmd, _ uintptr) {
}
Loading
Loading