diff --git a/internal/config/cgmgr/cgmgr.go b/internal/config/cgmgr/cgmgr.go index b650082c252..ab02ec9d42f 100644 --- a/internal/config/cgmgr/cgmgr.go +++ b/internal/config/cgmgr/cgmgr.go @@ -11,8 +11,12 @@ import ( "strconv" "strings" + "github.com/containers/podman/v3/pkg/rootless" "github.com/cri-o/cri-o/internal/config/node" libctr "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fs" + "github.com/opencontainers/runc/libcontainer/cgroups/fs2" + cgcfgs "github.com/opencontainers/runc/libcontainer/configs" rspec "github.com/opencontainers/runtime-spec/specs-go" "github.com/pkg/errors" "github.com/sirupsen/logrus" @@ -74,6 +78,9 @@ type CgroupManager interface { // CreateSandboxCgroup takes the sandbox parent, and sandbox ID. // It creates a new cgroup for that sandbox, which is useful when spoofing an infra container. CreateSandboxCgroup(sbParent, containerID string) error + // RemoveSandboxCgroup takes the sandbox parent, and sandbox ID. + // It removes the cgroup for that sandbox, which is useful when spoofing an infra container. + RemoveSandboxCgroup(sbParent, containerID string) error } // New creates a new CgroupManager with defaults @@ -161,3 +168,39 @@ func MoveProcessToContainerCgroup(containerPid, commandPid int) error { } return nil } + +// createSandboxCgroup takes the path of the sandbox parent and the desired containerCgroup +// It creates a cgroup through cgroupfs (as opposed to systemd) at the location cgroupRoot/sbParent/containerCgroup. 
+func createSandboxCgroup(sbParent, containerCgroup string) error { + mgr, err := libctrCgroupManager(sbParent, containerCgroup) + if err != nil { + return err + } + return mgr.Apply(-1) +} + +func removeSandboxCgroup(sbParent, containerCgroup string) error { + mgr, err := libctrCgroupManager(sbParent, containerCgroup) + if err != nil { + return err + } + return mgr.Destroy() +} + +func libctrCgroupManager(sbParent, containerCgroup string) (libctr.Manager, error) { + cg := &cgcfgs.Cgroup{ + Name: containerCgroup, + Parent: sbParent, + Resources: &cgcfgs.Resources{ + SkipDevices: true, + }, + } + if node.CgroupIsV2() { + return fs2.NewManager(cg, "", rootless.IsRootless()) + } + return fs.NewManager(cg, nil, rootless.IsRootless()), nil +} + +func containerCgroupPath(id string) string { + return crioPrefix + "-" + id +} diff --git a/internal/config/cgmgr/cgroupfs.go b/internal/config/cgmgr/cgroupfs.go index 7f7ed393e64..8b63b79f840 100644 --- a/internal/config/cgmgr/cgroupfs.go +++ b/internal/config/cgmgr/cgroupfs.go @@ -180,18 +180,18 @@ func setWorkloadSettings(cgPath string, resources *rspec.LinuxResources) error { return mgr.Set(cg.Resources) } -// createSandboxCgroup takes the sandbox parent, and sandbox ID. -// It creates a new cgroup for that sandbox, which is useful when spoofing an infra container. -func createSandboxCgroup(sbParent, containerID string, mgr CgroupManager) error { - cgroupAbsolutePath, err := mgr.ContainerCgroupAbsolutePath(sbParent, containerID) - if err != nil { - return err - } - _, err = cgroups.New(cgroupAbsolutePath, &rspec.LinuxResources{}) - return err -} - // CreateSandboxCgroup calls the helper function createSandboxCgroup for this manager. 
func (m *CgroupfsManager) CreateSandboxCgroup(sbParent, containerID string) error { - return createSandboxCgroup(sbParent, containerID, m) + // prepend "/" to sbParent so the fs driver interprets it as an absolute path + // and the cgroup isn't created as a relative path to the cgroups of the CRI-O process. + // https://github.com/opencontainers/runc/blob/fd5debf3aa/libcontainer/cgroups/fs/paths.go#L156 + return createSandboxCgroup(filepath.Join("/", sbParent), containerCgroupPath(containerID)) +} + +// RemoveSandboxCgroup calls the helper function removeSandboxCgroup for this manager. +func (m *CgroupfsManager) RemoveSandboxCgroup(sbParent, containerID string) error { + // prepend "/" to sbParent so the fs driver interprets it as an absolute path + // and the cgroup isn't resolved as a relative path to the cgroups of the CRI-O process. + // https://github.com/opencontainers/runc/blob/fd5debf3aa/libcontainer/cgroups/fs/paths.go#L156 + return removeSandboxCgroup(filepath.Join("/", sbParent), containerCgroupPath(containerID)) } diff --git a/internal/config/cgmgr/systemd.go b/internal/config/cgmgr/systemd.go index 3c55280fe16..c0ec1ab9d19 100644 --- a/internal/config/cgmgr/systemd.go +++ b/internal/config/cgmgr/systemd.go @@ -89,7 +89,7 @@ func (m *SystemdManager) ContainerCgroupAbsolutePath(sbParent, containerID strin return "", errors.Wrapf(err, "error expanding systemd slice to get container %s stats", containerID) } - return filepath.Join(cgroup, crioPrefix+"-"+containerID+".scope"), nil + return filepath.Join(cgroup, containerCgroupPath(containerID)+".scope"), nil } // MoveConmonToCgroup takes the container ID, cgroup parent, conmon's cgroup (from the config) and conmon's PID @@ -200,8 +200,37 @@ func convertCgroupFsNameToSystemd(cgroupfsName string) string { } // CreateSandboxCgroup calls the helper function createSandboxCgroup for this manager. +// Note: createSandboxCgroup will create a cgroupfs cgroup for the infra container underneath the pod slice.
+// It will not use dbus to create this cgroup, but instead call libcontainer's cgroupfs manager directly. +// This is because a scope created here will not have a process within it (as it's usually for a dropped infra container), +// and a slice cannot have the required `crio` prefix (while still being within the pod slice). +// Ultimately, this cgroup is required for cAdvisor to be able to register the pod and collect network metrics for it. +// This work will not be relevant when CRI-O is responsible for gathering pod metrics (KEP-2371), but is required until that's done. func (m *SystemdManager) CreateSandboxCgroup(sbParent, containerID string) error { - // If we are running systemd as cgroup driver then we would rely on - // systemd to create cgroups for us, there's nothing to do here in this case - return nil + // sbParent should always be specified by kubelet, but sometimes not by critest/crictl. + // Skip creation in this case. + if sbParent == "" { + logrus.Infof("Not creating sandbox cgroup: sbParent is empty") + return nil + } + expandedParent, err := systemd.ExpandSlice(sbParent) + if err != nil { + return err + } + return createSandboxCgroup(expandedParent, containerCgroupPath(containerID)) +} + +// RemoveSandboxCgroup calls the helper function removeSandboxCgroup for this manager. +func (m *SystemdManager) RemoveSandboxCgroup(sbParent, containerID string) error { + // sbParent should always be specified by kubelet, but sometimes not by critest/crictl. + // Skip removal in this case.
+ if sbParent == "" { + logrus.Infof("Not removing sandbox cgroup: sbParent is empty") + return nil + } + expandedParent, err := systemd.ExpandSlice(sbParent) + if err != nil { + return err + } + return removeSandboxCgroup(expandedParent, containerCgroupPath(containerID)) } diff --git a/server/sandbox_remove.go b/server/sandbox_remove.go index 6bbc9ac1e14..6ddd8e01bd2 100644 --- a/server/sandbox_remove.go +++ b/server/sandbox_remove.go @@ -52,6 +52,11 @@ func (s *Server) removePodSandbox(ctx context.Context, sb *sandbox.Sandbox) erro if err := s.removeContainerInPod(ctx, sb, sb.InfraContainer()); err != nil { return err } + if sb.InfraContainer().Spoofed() { + if err := s.config.CgroupManager().RemoveSandboxCgroup(sb.CgroupParent(), sb.ID()); err != nil { + return err + } + } // Cleanup network resources for this pod if err := s.networkStop(ctx, sb); err != nil { diff --git a/test/cgroups.bats b/test/cgroups.bats index 498502b2170..77d0a340b09 100644 --- a/test/cgroups.bats +++ b/test/cgroups.bats @@ -38,6 +38,28 @@ function teardown() { [[ "$output" == *"customcrioconmon.slice"* ]] } +@test "conmon custom cgroup with no infra container" { + parent="Burstablecriotest123" + if [ "$CONTAINER_CGROUP_MANAGER" == "systemd" ]; then + parent="$parent".slice + fi + cgroup_base="/sys/fs/cgroup" + if ! is_cgroup_v2; then + cgroup_base="$cgroup_base"/memory + fi + + CONTAINER_DROP_INFRA_CTR=true start_crio + + jq --arg cg "$parent" ' .linux.cgroup_parent = $cg' \ + "$TESTDATA"/sandbox_config.json > "$TESTDIR"/sandbox_config_slice.json + + pod_id=$(crictl runp "$TESTDIR"/sandbox_config_slice.json) + ls "$cgroup_base"/"$parent"/crio-"$pod_id"* + + crictl rmp -fa + ! ls "$cgroup_base"/"$parent"/crio-"$pod_id"* +} + @test "ctr with swap should be configured" { if !
grep -v Filename < /proc/swaps; then skip "swap not enabled" diff --git a/test/pod.bats b/test/pod.bats index 814b0fe494b..f46c7682f70 100644 --- a/test/pod.bats +++ b/test/pod.bats @@ -278,22 +278,17 @@ function teardown() { } @test "kubernetes pod terminationGracePeriod passthru" { - [ -v CIRCLECI ] && skip "runc v1.0.0-rc11 required" # TODO remove this - # Make sure there is no XDG_RUNTIME_DIR set, otherwise the test might end up using the user instance. # There is an assumption in the test to use the system instance of systemd (systemctl show). - CONTAINER_CGROUP_MANAGER="systemd" DBUS_SESSION_BUS_ADDRESS="" XDG_RUNTIME_DIR="" start_crio - - # for systemd, cgroup_parent should not be set - jq ' del(.linux.cgroup_parent)' \ - "$TESTDATA"/sandbox_config.json > "$TESTDIR"/sandbox.json + if [[ "$CONTAINER_CGROUP_MANAGER" != "systemd" ]]; then + skip "need systemd cgroup manager" + fi + # Make sure there is no XDG_RUNTIME_DIR set, otherwise the test might end up using the user instance. + DBUS_SESSION_BUS_ADDRESS="" XDG_RUNTIME_DIR="" start_crio jq ' .annotations += { "io.kubernetes.pod.terminationGracePeriod": "88" }' \ "$TESTDATA"/container_sleep.json > "$TESTDIR"/ctr.json - pod_id=$(crictl runp "$TESTDIR"/sandbox.json) - ctr_id=$(crictl create "$pod_id" "$TESTDIR"/ctr.json "$TESTDIR"/sandbox.json) - - crictl start "$ctr_id" + ctr_id=$(crictl run "$TESTDIR"/ctr.json "$TESTDATA"/sandbox_config.json) output=$(systemctl show "crio-${ctr_id}.scope") echo "$output" | grep 'TimeoutStopUSec=' || true # show