From d4b5d782a9f4ad9193edc39a1be1268a6b1a14b8 Mon Sep 17 00:00:00 2001 From: Peter Hunt Date: Fri, 28 Apr 2023 16:22:46 -0400 Subject: [PATCH 1/3] cgmgr: create cgroups for systemd cgroup driver for dropped infra pods The history here is a bit convoluted. Originally, runc created the cgroup for the infra container. cAdvisor was built to assume the cgroup for the infra container would be created, and it uses this cgroup to find the network metrics for the pod. When we dropped the infra container, cri-o needed to create this cgroup itself so cAdvisor could still find the network metrics. However, systemd didn't like the way we did it and would remove the cgroup in the middle of pod creation, which was fixed in https://github.com/cri-o/cri-o/pull/6196. That fix, in turn, caused the cgroup to not be created at all, which meant the networking metrics were not gathered at all. Thus, we do need to create a cgroup underneath the systemd cgroup. Attempt to use a slice for this, as systemd won't require a process to be underneath it. 
Signed-off-by: Peter Hunt --- internal/config/cgmgr/cgmgr.go | 43 +++++++++++++++++++++++++++++++ internal/config/cgmgr/cgroupfs.go | 24 ++++++++--------- internal/config/cgmgr/systemd.go | 37 +++++++++++++++++++++++--- server/sandbox_remove.go | 5 ++++ test/cgroups.bats | 22 ++++++++++++++++ 5 files changed, 115 insertions(+), 16 deletions(-) diff --git a/internal/config/cgmgr/cgmgr.go b/internal/config/cgmgr/cgmgr.go index b650082c252..e5e7675f215 100644 --- a/internal/config/cgmgr/cgmgr.go +++ b/internal/config/cgmgr/cgmgr.go @@ -13,6 +13,8 @@ import ( "github.com/cri-o/cri-o/internal/config/node" libctr "github.com/opencontainers/runc/libcontainer/cgroups" + libctrCgMgr "github.com/opencontainers/runc/libcontainer/cgroups/manager" + cgcfgs "github.com/opencontainers/runc/libcontainer/configs" rspec "github.com/opencontainers/runtime-spec/specs-go" "github.com/pkg/errors" "github.com/sirupsen/logrus" @@ -74,6 +76,9 @@ type CgroupManager interface { // CreateSandboxCgroup takes the sandbox parent, and sandbox ID. // It creates a new cgroup for that sandbox, which is useful when spoofing an infra container. CreateSandboxCgroup(sbParent, containerID string) error + // RemoveSandboxCgroup takes the sandbox parent, and sandbox ID. + // It removes the cgroup for that sandbox, which is useful when spoofing an infra container. + RemoveSandboxCgroup(sbParent, containerID string) error } // New creates a new CgroupManager with defaults @@ -161,3 +166,41 @@ func MoveProcessToContainerCgroup(containerPid, commandPid int) error { } return nil } + +// createSandboxCgroup takes the path of the sandbox parent and the desired containerCgroup +// It creates a cgroup through cgroupfs (as opposed to systemd) at the location cgroupRoot/sbParent/containerCgroup. 
+func createSandboxCgroup(sbParent, containerCgroup string) error { + cg := &cgcfgs.Cgroup{ + Name: containerCgroup, + Parent: sbParent, + Resources: &cgcfgs.Resources{ + SkipDevices: true, + }, + } + mgr, err := libctrCgMgr.New(cg) + if err != nil { + return err + } + + return mgr.Apply(-1) +} + +func removeSandboxCgroup(sbParent, containerCgroup string) error { + cg := &cgcfgs.Cgroup{ + Name: containerCgroup, + Parent: sbParent, + Resources: &cgcfgs.Resources{ + SkipDevices: true, + }, + } + mgr, err := libctrCgMgr.New(cg) + if err != nil { + return err + } + + return mgr.Destroy() +} + +func containerCgroupPath(id string) string { + return crioPrefix + "-" + id +} diff --git a/internal/config/cgmgr/cgroupfs.go b/internal/config/cgmgr/cgroupfs.go index 7f7ed393e64..8b63b79f840 100644 --- a/internal/config/cgmgr/cgroupfs.go +++ b/internal/config/cgmgr/cgroupfs.go @@ -180,18 +180,18 @@ func setWorkloadSettings(cgPath string, resources *rspec.LinuxResources) error { return mgr.Set(cg.Resources) } -// createSandboxCgroup takes the sandbox parent, and sandbox ID. -// It creates a new cgroup for that sandbox, which is useful when spoofing an infra container. -func createSandboxCgroup(sbParent, containerID string, mgr CgroupManager) error { - cgroupAbsolutePath, err := mgr.ContainerCgroupAbsolutePath(sbParent, containerID) - if err != nil { - return err - } - _, err = cgroups.New(cgroupAbsolutePath, &rspec.LinuxResources{}) - return err -} - // CreateSandboxCgroup calls the helper function createSandboxCgroup for this manager. func (m *CgroupfsManager) CreateSandboxCgroup(sbParent, containerID string) error { - return createSandboxCgroup(sbParent, containerID, m) + // prepend "/" to sbParent so the fs driver interprets it as an absolute path + // and the cgroup isn't created as a relative path to the cgroups of the CRI-O process. 
+ // https://github.com/opencontainers/runc/blob/fd5debf3aa/libcontainer/cgroups/fs/paths.go#L156 + return createSandboxCgroup(filepath.Join("/", sbParent), containerCgroupPath(containerID)) +} + +// RemoveSandboxCgroup calls the helper function removeSandboxCgroup for this manager. +func (m *CgroupfsManager) RemoveSandboxCgroup(sbParent, containerID string) error { + // prepend "/" to sbParent so the fs driver interprets it as an absolute path + // and the cgroup isn't created as a relative path to the cgroups of the CRI-O process. + // https://github.com/opencontainers/runc/blob/fd5debf3aa/libcontainer/cgroups/fs/paths.go#L156 + return removeSandboxCgroup(filepath.Join("/", sbParent), containerCgroupPath(containerID)) } diff --git a/internal/config/cgmgr/systemd.go b/internal/config/cgmgr/systemd.go index 3c55280fe16..c0ec1ab9d19 100644 --- a/internal/config/cgmgr/systemd.go +++ b/internal/config/cgmgr/systemd.go @@ -89,7 +89,7 @@ func (m *SystemdManager) ContainerCgroupAbsolutePath(sbParent, containerID strin return "", errors.Wrapf(err, "error expanding systemd slice to get container %s stats", containerID) } - return filepath.Join(cgroup, crioPrefix+"-"+containerID+".scope"), nil + return filepath.Join(cgroup, containerCgroupPath(containerID)+".scope"), nil } // MoveConmonToCgroup takes the container ID, cgroup parent, conmon's cgroup (from the config) and conmon's PID @@ -200,8 +200,37 @@ func convertCgroupFsNameToSystemd(cgroupfsName string) string { } // CreateSandboxCgroup calls the helper function createSandboxCgroup for this manager. +// Note: createSandboxCgroup will create a cgroupfs cgroup for the infra container underneath the pod slice. +// It will not use dbus to create this cgroup, but instead call libcontainer's cgroupfs manager directly. 
+// This is because a scope created here will not have a process within it (as it's usually for a dropped infra container), +// and a slice cannot have the required `crio` prefix (while still being within the pod slice). +// Ultimately, this cgroup is required for cAdvisor to be able to register the pod and collect network metrics for it. +// This work will not be relevant when CRI-O is responsible for gathering pod metrics (KEP-2371), but is required until that's done. func (m *SystemdManager) CreateSandboxCgroup(sbParent, containerID string) error { - // If we are running systemd as cgroup driver then we would rely on - // systemd to create cgroups for us, there's nothing to do here in this case - return nil + // sbParent should always be specified by kubelet, but sometimes not by critest/crictl. + // Skip creation in this case. + if sbParent == "" { + logrus.Infof("Not creating sandbox cgroup: sbParent is empty") + return nil + } + expandedParent, err := systemd.ExpandSlice(sbParent) + if err != nil { + return err + } + return createSandboxCgroup(expandedParent, containerCgroupPath(containerID)) +} + +// RemoveSandboxCgroup calls the helper function removeSandboxCgroup for this manager. +func (m *SystemdManager) RemoveSandboxCgroup(sbParent, containerID string) error { + // sbParent should always be specified by kubelet, but sometimes not by critest/crictl. + // Skip removal in this case. 
+ if sbParent == "" { + logrus.Infof("Not creating sandbox cgroup: sbParent is empty") + return nil + } + expandedParent, err := systemd.ExpandSlice(sbParent) + if err != nil { + return err + } + return removeSandboxCgroup(expandedParent, containerCgroupPath(containerID)) } diff --git a/server/sandbox_remove.go b/server/sandbox_remove.go index 6bbc9ac1e14..6ddd8e01bd2 100644 --- a/server/sandbox_remove.go +++ b/server/sandbox_remove.go @@ -52,6 +52,11 @@ func (s *Server) removePodSandbox(ctx context.Context, sb *sandbox.Sandbox) erro if err := s.removeContainerInPod(ctx, sb, sb.InfraContainer()); err != nil { return err } + if sb.InfraContainer().Spoofed() { + if err := s.config.CgroupManager().RemoveSandboxCgroup(sb.CgroupParent(), sb.ID()); err != nil { + return err + } + } // Cleanup network resources for this pod if err := s.networkStop(ctx, sb); err != nil { diff --git a/test/cgroups.bats b/test/cgroups.bats index 498502b2170..77d0a340b09 100644 --- a/test/cgroups.bats +++ b/test/cgroups.bats @@ -38,6 +38,28 @@ function teardown() { [[ "$output" == *"customcrioconmon.slice"* ]] } +@test "conmon custom cgroup with no infra container" { + parent="Burstablecriotest123" + if [ "$CONTAINER_CGROUP_MANAGER" == "systemd" ]; then + parent="$parent".slice + fi + cgroup_base="/sys/fs/cgroup" + if ! is_cgroup_v2; then + cgroup_base="$cgroup_base"/memory + fi + + CONTAINER_DROP_INFRA_CTR=true start_crio + + jq --arg cg "$parent" ' .linux.cgroup_parent = $cg' \ + "$TESTDATA"/sandbox_config.json > "$TESTDIR"/sandbox_config_slice.json + + pod_id=$(crictl runp "$TESTDIR"/sandbox_config_slice.json) + ls "$cgroup_base"/"$parent"/crio-"$pod_id"* + + crictl rmp -fa + ! ls "$cgroup_base"/"$parent"/crio-"$pod_id"* +} + @test "ctr with swap should be configured" { if ! 
grep -v Filename < /proc/swaps; then skip "swap not enabled" From 2a589fa7a764e8aedcf9865a2a417b0d14475567 Mon Sep 17 00:00:00 2001 From: Peter Hunt Date: Mon, 1 May 2023 10:24:48 -0400 Subject: [PATCH 2/3] test/pod.bats: update to current setup - skip test for cgroupfs - remove skip for runc 1.0.0-rc11 (very old now) - drop removal of cgroup parent (not required) Signed-off-by: Peter Hunt --- test/pod.bats | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/test/pod.bats b/test/pod.bats index 814b0fe494b..f46c7682f70 100644 --- a/test/pod.bats +++ b/test/pod.bats @@ -278,22 +278,17 @@ function teardown() { } @test "kubernetes pod terminationGracePeriod passthru" { - [ -v CIRCLECI ] && skip "runc v1.0.0-rc11 required" # TODO remove this - # Make sure there is no XDG_RUNTIME_DIR set, otherwise the test might end up using the user instance. # There is an assumption in the test to use the system instance of systemd (systemctl show). - CONTAINER_CGROUP_MANAGER="systemd" DBUS_SESSION_BUS_ADDRESS="" XDG_RUNTIME_DIR="" start_crio - - # for systemd, cgroup_parent should not be set - jq ' del(.linux.cgroup_parent)' \ - "$TESTDATA"/sandbox_config.json > "$TESTDIR"/sandbox.json + if [[ "$CONTAINER_CGROUP_MANAGER" != "systemd" ]]; then + skip "need systemd cgroup manager" + fi + # Make sure there is no XDG_RUNTIME_DIR set, otherwise the test might end up using the user instance. 
+ DBUS_SESSION_BUS_ADDRESS="" XDG_RUNTIME_DIR="" start_crio jq ' .annotations += { "io.kubernetes.pod.terminationGracePeriod": "88" }' \ "$TESTDATA"/container_sleep.json > "$TESTDIR"/ctr.json - pod_id=$(crictl runp "$TESTDIR"/sandbox.json) - ctr_id=$(crictl create "$pod_id" "$TESTDIR"/ctr.json "$TESTDIR"/sandbox.json) - - crictl start "$ctr_id" + ctr_id=$(crictl run "$TESTDIR"/ctr.json "$TESTDATA"/sandbox_config.json) output=$(systemctl show "crio-${ctr_id}.scope") echo "$output" | grep 'TimeoutStopUSec=' || true # show From 1edcf7f17205393d0851565d4bc02fe53d7a1fb1 Mon Sep 17 00:00:00 2001 From: Peter Hunt Date: Mon, 22 May 2023 15:47:36 -0400 Subject: [PATCH 3/3] cgmgr: update to use old libcontainer cgroup impl Signed-off-by: Peter Hunt --- internal/config/cgmgr/cgmgr.go | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/internal/config/cgmgr/cgmgr.go b/internal/config/cgmgr/cgmgr.go index e5e7675f215..ab02ec9d42f 100644 --- a/internal/config/cgmgr/cgmgr.go +++ b/internal/config/cgmgr/cgmgr.go @@ -11,9 +11,11 @@ import ( "strconv" "strings" + "github.com/containers/podman/v3/pkg/rootless" "github.com/cri-o/cri-o/internal/config/node" libctr "github.com/opencontainers/runc/libcontainer/cgroups" - libctrCgMgr "github.com/opencontainers/runc/libcontainer/cgroups/manager" + "github.com/opencontainers/runc/libcontainer/cgroups/fs" + "github.com/opencontainers/runc/libcontainer/cgroups/fs2" cgcfgs "github.com/opencontainers/runc/libcontainer/configs" rspec "github.com/opencontainers/runtime-spec/specs-go" "github.com/pkg/errors" @@ -170,22 +172,22 @@ func MoveProcessToContainerCgroup(containerPid, commandPid int) error { // createSandboxCgroup takes the path of the sandbox parent and the desired containerCgroup // It creates a cgroup through cgroupfs (as opposed to systemd) at the location cgroupRoot/sbParent/containerCgroup. 
func createSandboxCgroup(sbParent, containerCgroup string) error { - cg := &cgcfgs.Cgroup{ - Name: containerCgroup, - Parent: sbParent, - Resources: &cgcfgs.Resources{ - SkipDevices: true, - }, - } - mgr, err := libctrCgMgr.New(cg) + mgr, err := libctrCgroupManager(sbParent, containerCgroup) if err != nil { return err } - return mgr.Apply(-1) } func removeSandboxCgroup(sbParent, containerCgroup string) error { + mgr, err := libctrCgroupManager(sbParent, containerCgroup) + if err != nil { + return err + } + return mgr.Destroy() +} + +func libctrCgroupManager(sbParent, containerCgroup string) (libctr.Manager, error) { cg := &cgcfgs.Cgroup{ Name: containerCgroup, Parent: sbParent, @@ -193,12 +195,10 @@ func removeSandboxCgroup(sbParent, containerCgroup string) error { SkipDevices: true, }, } - mgr, err := libctrCgMgr.New(cg) - if err != nil { - return err + if node.CgroupIsV2() { + return fs2.NewManager(cg, "", rootless.IsRootless()) } - - return mgr.Destroy() + return fs.NewManager(cg, nil, rootless.IsRootless()), nil } func containerCgroupPath(id string) string {