diff --git a/go.mod b/go.mod index 0d2a94fe2a9..2f3d8908720 100644 --- a/go.mod +++ b/go.mod @@ -40,7 +40,7 @@ require ( github.com/onsi/gomega v1.11.0 github.com/opencontainers/go-digest v1.0.0 github.com/opencontainers/image-spec v1.0.2-0.20200206005212-79b036d80240 - github.com/opencontainers/runc v1.0.0-rc94 + github.com/opencontainers/runc v1.0.0-rc95.0.20210521141834-a95237f81684 github.com/opencontainers/runtime-spec v1.0.3-0.20210326190908-1c3f411f0417 github.com/opencontainers/runtime-tools v0.9.1-0.20200121211434-d1bf3e66ff0a github.com/opencontainers/selinux v1.8.1 diff --git a/go.sum b/go.sum index f9a3522194d..7817ab17621 100644 --- a/go.sum +++ b/go.sum @@ -1030,8 +1030,9 @@ github.com/opencontainers/runc v1.0.0-rc8.0.20190926000215-3e425f80a8c9/go.mod h github.com/opencontainers/runc v1.0.0-rc9/go.mod h1:qT5XzbpPznkRYVz/mWwUaVBUv2rmF59PVA73FjuZG0U= github.com/opencontainers/runc v1.0.0-rc91/go.mod h1:3Sm6Dt7OT8z88EbdQqqcRN2oCT54jbi72tT/HqgflT8= github.com/opencontainers/runc v1.0.0-rc93/go.mod h1:3NOsor4w32B2tC0Zbl8Knk4Wg84SM2ImC1fxBuqJ/H0= -github.com/opencontainers/runc v1.0.0-rc94 h1:atqAFoBGp+Wkh9HKpYN3g/8NCbMzYG6SJrr+YgwamgM= github.com/opencontainers/runc v1.0.0-rc94/go.mod h1:z+bZxa/+Tz/FmYVWkhUajJdzFeOqjc5vrqskhVyHGUM= +github.com/opencontainers/runc v1.0.0-rc95.0.20210521141834-a95237f81684 h1:lxWmdjKd6ohpRh4G2ogFNS4EAyAbwWZnlOcfYfpu22s= +github.com/opencontainers/runc v1.0.0-rc95.0.20210521141834-a95237f81684/go.mod h1:z+bZxa/+Tz/FmYVWkhUajJdzFeOqjc5vrqskhVyHGUM= github.com/opencontainers/runtime-spec v1.0.3-0.20201121164853-7413a7f753e1 h1:UAfI7SOCo1CNIu3RevW9B4HQyf7SY5aSzcSeoC7OPs0= github.com/opencontainers/runtime-spec v1.0.3-0.20201121164853-7413a7f753e1/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0= github.com/opencontainers/runtime-tools v0.0.0-20181011054405-1d69bd0f9c39/go.mod h1:r3f7wjNzSs2extwzU3Y+6pKfobzPh+kKFJ3ofN+3nfs= diff --git a/internal/config/cgmgr/cgmgr.go b/internal/config/cgmgr/cgmgr.go index 
3a0d8a9551d..220734d821e 100644 --- a/internal/config/cgmgr/cgmgr.go +++ b/internal/config/cgmgr/cgmgr.go @@ -58,11 +58,11 @@ type CgroupManager interface { // returns the cgroup parent, cgroup path, and error. For systemd cgroups, // it also checks there is enough memory in the given cgroup SandboxCgroupPath(string, string) (string, string, error) - // MoveConmonToCgroup takes the container ID, cgroup parent, conmon's cgroup (from the config) and conmon's PID - // It attempts to move conmon to the correct cgroup. + // MoveConmonToCgroup takes the container ID, cgroup parent, conmon's cgroup (from the config), conmon's PID, and some customized resources (may be nil). + // It attempts to move conmon to the correct cgroup and sets the resources for that cgroup. // It returns the cgroupfs parent that conmon was put into // so that CRI-O can clean the parent cgroup of the newly added conmon once the process terminates (systemd handles this for us) - MoveConmonToCgroup(cid, cgroupParent, conmonCgroup string, pid int) (string, error) + MoveConmonToCgroup(cid, cgroupParent, conmonCgroup string, pid int, resources *rspec.LinuxResources) (string, error) // CreateSandboxCgroup takes the sandbox parent, and sandbox ID. // It creates a new cgroup for that sandbox, which is useful when spoofing an infra container. 
CreateSandboxCgroup(sbParent, containerID string) error diff --git a/internal/config/cgmgr/cgmgr_test.go b/internal/config/cgmgr/cgmgr_test.go index 06e736a934a..6ccceea215f 100644 --- a/internal/config/cgmgr/cgmgr_test.go +++ b/internal/config/cgmgr/cgmgr_test.go @@ -160,7 +160,7 @@ var _ = t.Describe("Config", func() { // Given conmonCgroup := "notPodOrEmpty" // When - cgPath, err := sut.MoveConmonToCgroup("", "", conmonCgroup, 0) + cgPath, err := sut.MoveConmonToCgroup("", "", conmonCgroup, 0, nil) // Then Expect(cgPath).To(BeEmpty()) @@ -245,7 +245,7 @@ var _ = t.Describe("Config", func() { // Given conmonCgroup := "notPodOrEmpty" // When - cgPath, err := sut.MoveConmonToCgroup("", "", conmonCgroup, -1) + cgPath, err := sut.MoveConmonToCgroup("", "", conmonCgroup, -1, nil) // Then Expect(cgPath).To(BeEmpty()) diff --git a/internal/config/cgmgr/cgroupfs.go b/internal/config/cgmgr/cgroupfs.go index b7eb04c5dd1..0821d389342 100644 --- a/internal/config/cgmgr/cgroupfs.go +++ b/internal/config/cgmgr/cgroupfs.go @@ -9,6 +9,13 @@ import ( "strings" "github.com/containers/podman/v3/pkg/cgroups" + "github.com/containers/podman/v3/pkg/rootless" + "github.com/cri-o/cri-o/internal/config/node" + libctr "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fs" + "github.com/opencontainers/runc/libcontainer/cgroups/fs2" + cgcfgs "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/devices" rspec "github.com/opencontainers/runtime-spec/specs-go" "github.com/pkg/errors" "github.com/sirupsen/logrus" @@ -68,11 +75,15 @@ func (m *CgroupfsManager) SandboxCgroupPath(sbParent, sbID string) (cgParent, cg // It attempts to move conmon to the correct cgroup. 
// It returns the cgroupfs parent that conmon was put into // so that CRI-O can clean the cgroup path of the newly added conmon once the process terminates (systemd handles this for us) -func (*CgroupfsManager) MoveConmonToCgroup(cid, cgroupParent, conmonCgroup string, pid int) (string, error) { +func (*CgroupfsManager) MoveConmonToCgroup(cid, cgroupParent, conmonCgroup string, pid int, resources *rspec.LinuxResources) (cgroupPathToClean string, _ error) { if conmonCgroup != "pod" && conmonCgroup != "" { return "", errors.Errorf("conmon cgroup %s invalid for cgroupfs", conmonCgroup) } + if resources == nil { + resources = &rspec.LinuxResources{} + } + cgroupPath := fmt.Sprintf("%s/crio-conmon-%s", cgroupParent, cid) control, err := cgroups.New(cgroupPath, &rspec.LinuxResources{}) if err != nil { @@ -82,6 +93,10 @@ func (*CgroupfsManager) MoveConmonToCgroup(cid, cgroupParent, conmonCgroup strin return cgroupPath, nil } + if err := setWorkloadSettings(cgroupPath, resources); err != nil { + return cgroupPath, err + } + // Record conmon's cgroup path in the container, so we can properly // clean it up when removing the container. 
// Here we should defer a crio-connmon- cgroup hierarchy deletion, but it will @@ -96,6 +111,51 @@ func (*CgroupfsManager) MoveConmonToCgroup(cid, cgroupParent, conmonCgroup strin return cgroupPath, nil } +func setWorkloadSettings(cgPath string, resources *rspec.LinuxResources) error { + var mgr libctr.Manager + if resources.CPU == nil { + return nil + } + + paths := map[string]string{ + "cpuset": filepath.Join("/sys/fs/cgroup", "cpuset", cgPath), + "cpu": filepath.Join("/sys/fs/cgroup", "cpu", cgPath), + "freezer": filepath.Join("/sys/fs/cgroup", "freezer", cgPath), + "devices": filepath.Join("/sys/fs/cgroup", "devices", cgPath), + } + + cg := &cgcfgs.Cgroup{ + Name: cgPath, + Resources: &cgcfgs.Resources{}, + } + if resources.CPU.Cpus != "" { + cg.Resources.CpusetCpus = resources.CPU.Cpus + } + if resources.CPU.Shares != nil { + cg.Resources.CpuShares = *resources.CPU.Shares + } + + // We need to white list all devices + // so containers created underneath won't fail + cg.Resources.Devices = []*devices.Rule{ + { + Type: devices.WildcardDevice, + Allow: true, + }, + } + + if node.CgroupIsV2() { + var err error + mgr, err = fs2.NewManager(cg, cgPath, rootless.IsRootless()) + if err != nil { + return err + } + } else { + mgr = fs.NewManager(cg, paths, rootless.IsRootless()) + } + return mgr.Set(cg.Resources) +} + // CreateSandboxCgroup calls the helper function createSandboxCgroup for this manager. 
func (m *CgroupfsManager) CreateSandboxCgroup(sbParent, containerID string) error { return createSandboxCgroup(sbParent, containerID, m) diff --git a/internal/config/cgmgr/systemd.go b/internal/config/cgmgr/systemd.go index 82138a65079..32e7948ea28 100644 --- a/internal/config/cgmgr/systemd.go +++ b/internal/config/cgmgr/systemd.go @@ -9,9 +9,11 @@ import ( "strings" systemdDbus "github.com/coreos/go-systemd/v22/dbus" + "github.com/cri-o/cri-o/internal/config/node" "github.com/cri-o/cri-o/utils" "github.com/godbus/dbus/v5" "github.com/opencontainers/runc/libcontainer/cgroups/systemd" + rspec "github.com/opencontainers/runtime-spec/specs-go" "github.com/pkg/errors" "github.com/sirupsen/logrus" "golang.org/x/sys/unix" @@ -68,7 +70,7 @@ func (*SystemdManager) ContainerCgroupAbsolutePath(sbParent, containerID string) // cgroupPathToClean should always be returned empty. It is part of the interface to return the cgroup path // that cri-o is responsible for cleaning up upon the container's death. // Systemd takes care of this cleaning for us, so return an empty string -func (*SystemdManager) MoveConmonToCgroup(cid, cgroupParent, conmonCgroup string, pid int) (cgroupPathToClean string, _ error) { +func (*SystemdManager) MoveConmonToCgroup(cid, cgroupParent, conmonCgroup string, pid int, resources *rspec.LinuxResources) (cgroupPathToClean string, _ error) { if strings.HasSuffix(conmonCgroup, ".slice") { cgroupParent = conmonCgroup } @@ -77,12 +79,39 @@ func (*SystemdManager) MoveConmonToCgroup(cid, cgroupParent, conmonCgroup string // Set the systemd KillSignal to SIGPIPE that conmon ignores. // This helps during node shutdown so that conmon waits for the container // to exit and doesn't forward the SIGTERM that it gets. 
- killSignalProp := systemdDbus.Property{ - Name: "KillSignal", - Value: dbus.MakeVariant(int(unix.SIGPIPE)), + props := []systemdDbus.Property{ + { + Name: "KillSignal", + Value: dbus.MakeVariant(int(unix.SIGPIPE)), + }, + systemdDbus.PropAfter("crio.service"), } + + if resources != nil && resources.CPU != nil { + if resources.CPU.Cpus != "" { + if !node.SystemdHasAllowedCPUs() { + logrus.Errorf("Systemd does not support AllowedCPUs; skipping setting for workload") + } else { + bits, err := systemd.RangeToBits(resources.CPU.Cpus) + if err != nil { + return "", errors.Wrapf(err, "cpuset conversion error") + } + props = append(props, systemdDbus.Property{ + Name: "AllowedCPUs", + Value: dbus.MakeVariant(bits), + }) + } + } + if resources.CPU.Shares != nil { + props = append(props, systemdDbus.Property{ + Name: "CPUShares", + Value: dbus.MakeVariant(resources.CPU.Shares), + }) + } + } + logrus.Debugf("Running conmon under slice %s and unitName %s", cgroupParent, conmonUnitName) - if err := utils.RunUnderSystemdScope(pid, cgroupParent, conmonUnitName, killSignalProp, systemdDbus.PropAfter("crio.service")); err != nil { + if err := utils.RunUnderSystemdScope(pid, cgroupParent, conmonUnitName, props...); err != nil { return "", errors.Wrapf(err, "failed to add conmon to systemd sandbox cgroup") } // return empty string as path because cgroup cleanup is done by systemd diff --git a/internal/config/node/node.go b/internal/config/node/node.go index 63a3595de08..44c026f1ca7 100644 --- a/internal/config/node/node.go +++ b/internal/config/node/node.go @@ -55,6 +55,13 @@ func ValidateConfig() error { activated: &systemdHasCollectMode, fatal: false, }, + { + name: "systemd AllowedCPUs", + init: SystemdHasAllowedCPUs, + err: &systemdHasAllowedCPUsErr, + activated: &systemdHasAllowedCPUs, + fatal: false, + }, { name: "fs.may_detach_mounts sysctl", init: checkFsMayDetachMounts, diff --git a/internal/config/node/systemd.go b/internal/config/node/systemd.go index 
ca491159dd9..666776e2780 100644 --- a/internal/config/node/systemd.go +++ b/internal/config/node/systemd.go @@ -13,17 +13,35 @@ var ( systemdHasCollectModeOnce sync.Once systemdHasCollectMode bool systemdHasCollectModeErr error + + systemdHasAllowedCPUsOnce sync.Once + systemdHasAllowedCPUs bool + systemdHasAllowedCPUsErr error ) func SystemdHasCollectMode() bool { systemdHasCollectModeOnce.Do(func() { - // This will show whether the currently running systemd supports CollectMode - _, err := exec.Command("systemctl", "show", "-p", "CollectMode", "systemd").Output() - if err != nil { - systemdHasCollectModeErr = errors.Wrapf(err, "check systemd CollectMode") - return - } - systemdHasCollectMode = true + systemdHasCollectMode, systemdHasCollectModeErr = systemdSupportsProperty("CollectMode") }) return systemdHasCollectMode } + +func SystemdHasAllowedCPUs() bool { + systemdHasAllowedCPUsOnce.Do(func() { + systemdHasAllowedCPUs, systemdHasAllowedCPUsErr = systemdSupportsProperty("AllowedCPUs") + }) + return systemdHasAllowedCPUs +} + +// systemdSupportsProperty checks whether systemd supports a property. +// It returns false if systemctl prints no output for the property, and an error only if the systemctl command fails. 
+func systemdSupportsProperty(property string) (bool, error) { + output, err := exec.Command("systemctl", "show", "-p", property, "systemd").Output() + if err != nil { + return false, errors.Wrapf(err, "check systemd %s", property) + } + if len(output) == 0 { + return false, nil + } + return true, nil +} diff --git a/internal/oci/oci_linux.go b/internal/oci/oci_linux.go index a200195ddc4..59ec75543b1 100644 --- a/internal/oci/oci_linux.go +++ b/internal/oci/oci_linux.go @@ -14,6 +14,9 @@ import ( "github.com/containers/podman/v3/pkg/cgroups" "github.com/cri-o/cri-o/internal/config/node" + "github.com/cri-o/cri-o/server/cri/types" + rspec "github.com/opencontainers/runtime-spec/specs-go" + "github.com/opencontainers/runtime-tools/generate" "github.com/pkg/errors" "github.com/sirupsen/logrus" "golang.org/x/sys/unix" @@ -23,8 +26,20 @@ func (r *runtimeOCI) createContainerPlatform(c *Container, cgroupParent string, if c.Spoofed() { return nil } + g := &generate.Generator{ + Config: &rspec.Spec{ + Linux: &rspec.Linux{ + Resources: &rspec.LinuxResources{}, + }, + }, + } + // Mutate our newly created spec to find the customizations that are needed for conmon + if err := r.config.Workloads.MutateSpecGivenAnnotations(types.InfraContainerName, g, c.Annotations()); err != nil { + return err + } + // Move conmon to specified cgroup - conmonCgroupfsPath, err := r.config.CgroupManager().MoveConmonToCgroup(c.id, cgroupParent, r.config.ConmonCgroup, pid) + conmonCgroupfsPath, err := r.config.CgroupManager().MoveConmonToCgroup(c.id, cgroupParent, r.config.ConmonCgroup, pid, g.Config.Linux.Resources) if err != nil { return err } diff --git a/server/cri/types/types.go b/server/cri/types/types.go index 01e8939860a..3eaa45b7290 100644 --- a/server/cri/types/types.go +++ b/server/cri/types/types.go @@ -22,6 +22,8 @@ const ( PodSandboxStateSandboxReady PodSandboxState = 0 PodSandboxStateSandboxNotReady PodSandboxState = 1 + + InfraContainerName = "POD" ) type VersionRequest struct { diff 
--git a/server/naming.go b/server/naming.go index 9185150c884..f413bda05bd 100644 --- a/server/naming.go +++ b/server/naming.go @@ -10,14 +10,13 @@ import ( const ( kubePrefix = "k8s" - infraName = "POD" nameDelimiter = "_" ) func makeSandboxContainerName(sandboxConfig *types.PodSandboxConfig) string { return strings.Join([]string{ kubePrefix, - infraName, + types.InfraContainerName, sandboxConfig.Metadata.Name, sandboxConfig.Metadata.Namespace, sandboxConfig.Metadata.UID, diff --git a/server/sandbox_run_linux.go b/server/sandbox_run_linux.go index deb6c6f4cc4..d86b9da3427 100644 --- a/server/sandbox_run_linux.go +++ b/server/sandbox_run_linux.go @@ -38,7 +38,6 @@ import ( "golang.org/x/net/context" "golang.org/x/sys/unix" "k8s.io/apimachinery/pkg/api/resource" - "k8s.io/kubernetes/pkg/kubelet/leaky" kubeletTypes "k8s.io/kubernetes/pkg/kubelet/types" ) @@ -281,7 +280,7 @@ func (s *Server) runPodSandbox(ctx context.Context, req *types.RunPodSandboxRequ pathsToChown := []string{} // we need to fill in the container name, as it is not present in the request. Luckily, it is a constant. 
- log.Infof(ctx, "Running pod sandbox: %s%s", translateLabelsToDescription(sbox.Config().Labels), leaky.PodInfraContainerName) + log.Infof(ctx, "Running pod sandbox: %s%s", translateLabelsToDescription(sbox.Config().Labels), types.InfraContainerName) kubeName := sbox.Config().Metadata.Name namespace := sbox.Config().Metadata.Namespace @@ -461,7 +460,7 @@ func (s *Server) runPodSandbox(ctx context.Context, req *types.RunPodSandboxRequ // Add special container name label for the infra container if labels != nil { - labels[kubeletTypes.KubernetesContainerNameLabel] = leaky.PodInfraContainerName + labels[kubeletTypes.KubernetesContainerNameLabel] = types.InfraContainerName } labelsJSON, err := json.Marshal(labels) if err != nil { diff --git a/server/server.go b/server/server.go index 222fc549ff7..6604748b7a2 100644 --- a/server/server.go +++ b/server/server.go @@ -207,7 +207,7 @@ func (s *Server) restore(ctx context.Context) []string { log.Warnf(ctx, "unable to delete container %s: %v", n, err) } // Release the infra container name and the pod name for future use - if strings.Contains(n, infraName) { + if strings.Contains(n, types.InfraContainerName) { s.ReleaseContainerName(n) } else { s.ReleasePodName(n) diff --git a/test/workloads.bats b/test/workloads.bats index 49848bfd7bf..d4d262b2f07 100644 --- a/test/workloads.bats +++ b/test/workloads.bats @@ -8,6 +8,8 @@ function setup() { setup_test sboxconfig="$TESTDIR/sbox.json" ctrconfig="$TESTDIR/ctr.json" + systemd_supports_cpuset=$(systemctl show --property=AllowedCPUs systemd || true) + export systemd_supports_cpuset } function teardown() { @@ -47,6 +49,58 @@ function check_cpu_fields() { fi } +function check_conmon_fields() { + local ctr_id="$1" + local cpushares="$2" + local cpuset="$3" + + if [[ "$CONTAINER_CGROUP_MANAGER" == "cgroupfs" ]]; then + if is_cgroup_v2; then + cpuset_path="/sys/fs/cgroup" + cpushare_path="/sys/fs/cgroup" + cpushare_filename="cpu.weight" + # see 
https://github.com/containers/crun/blob/e5874864918f8f07acdff083f83a7a59da8abb72/crun.1.md#cpu-controller for conversion + cpushares=$((1 + ((cpushares - 2) * 9999) / 262142)) + else + cpuset_path="/sys/fs/cgroup/cpuset" + cpushare_path="/sys/fs/cgroup/cpu" + cpushare_filename="cpu.shares" + fi + + found_cpuset=$(cat "$cpuset_path/pod_123-456/crio-conmon-$ctr_id/cpuset.cpus") + if [ -z "$cpuset" ]; then + [[ $(cat "$cpuset_path/pod_123-456/cpuset.cpus") == *"$found_cpuset"* ]] + else + [[ "$cpuset" == *"$found_cpuset"* ]] + fi + + found_cpushares=$(cat "$cpushare_path/pod_123-456/crio-conmon-$ctr_id/$cpushare_filename") + if [ -z "$cpushares" ]; then + [[ $(cat "$cpushare_path/pod_123-456/$cpushare_filename") == *"$found_cpushares"* ]] + else + [[ "$cpushares" == *"$found_cpushares"* ]] + fi + else + # don't test cpuset if it's not supported by systemd + if [[ -n "$systemd_supports_cpuset" ]]; then + info="$(systemctl show --property=AllowedCPUs crio-conmon-"$ctr_id".scope)" + if [ -z "$cpuset" ]; then + echo "$info" | grep -E '^AllowedCPUs=$' + else + [[ "$info" == *"AllowedCPUs=$cpuset"* ]] + fi + fi + + info="$(systemctl show --property=CPUShares crio-conmon-"$ctr_id".scope)" + if [ -z "$cpushares" ]; then + # 18446744073709551615 is 2^64-1, which is the default systemd set in RHEL 7 + echo "$info" | grep -E '^CPUShares=\[not set\]$' || echo "$info" | grep 'CPUShares=18446744073709551615' + else + [[ "$info" == *"CPUShares=$cpushares"* ]] + fi + fi +} + @test "test workload gets configured to defaults" { shares="200" set="0-1" @@ -69,7 +123,7 @@ function check_cpu_fields() { shares="200" set="0-1" name=helloctr - create_workload "$shares" "0-2" + create_workload "$shares" "0" start_crio @@ -88,7 +142,7 @@ function check_cpu_fields() { check_cpu_fields "$ctr_id" "$shares" "$set" } -@test "test workload should not set if not defaulted or specified" { +@test "test workload should not be set if not defaulted or specified" { shares="200" set="" name=helloctr @@ 
-111,7 +165,7 @@ function check_cpu_fields() { check_cpu_fields "$ctr_id" "$shares" "$set" } -@test "test workload should not set if annotation not specified" { +@test "test workload should not be set if annotation not specified" { shares="" set="" name=helloctr @@ -132,3 +186,86 @@ function check_cpu_fields() { ctr_id=$(crictl run "$ctrconfig" "$sboxconfig") check_cpu_fields "$ctr_id" "$shares" "$set" } + +@test "test workload pod gets configured to defaults" { + shares="200" + set="0-1" + create_workload "$shares" "$set" + + start_crio + + jq --arg act "$activation" ' .annotations[$act] = "true"' \ + "$TESTDATA"/sandbox_config.json > "$sboxconfig" + + jq --arg act "$activation" ' .annotations[$act] = "true"' \ + "$TESTDATA"/container_sleep.json > "$ctrconfig" + + ctr_id=$(crictl run "$ctrconfig" "$sboxconfig") + + check_conmon_fields "$ctr_id" "$shares" "$set" +} + +@test "test workload can override pod defaults" { + shares="200" + set="0-1" + name=POD + create_workload "$shares" "0" + + start_crio + + jq --arg act "$activation" --arg set "{\"cpuset\": \"$set\"}" --arg setkey "$prefix/$name" \ + ' .annotations[$act] = "true" + | .annotations[$setkey] = $set' \ + "$TESTDATA"/sandbox_config.json > "$sboxconfig" + + jq --arg act "$activation" --arg name "$name" --arg set "{\"cpuset\": \"$set\"}" --arg setkey "$prefix/$name" \ + ' .annotations[$act] = "true" + | .annotations[$setkey] = $set' \ + "$TESTDATA"/container_sleep.json > "$ctrconfig" + + ctr_id=$(crictl run "$ctrconfig" "$sboxconfig") + check_conmon_fields "$ctr_id" "$shares" "$set" +} + +@test "test workload pod should not be set if not defaulted or specified" { + shares="200" + set="" + name=POD + create_workload "$shares" "" + + start_crio + + jq --arg act "$activation" --arg set "{\"cpuset\": \"$set\"}" --arg setkey "$prefix/$name" \ + ' .annotations[$act] = "true" + | .annotations[$setkey] = $set' \ + "$TESTDATA"/sandbox_config.json > "$sboxconfig" + + jq --arg act "$activation" --arg name "$name" --arg 
set "{\"cpuset\": \"$set\"}" --arg setkey "$prefix/$name" \ + ' .annotations[$act] = "true" + | .annotations[$setkey] = $set' \ + "$TESTDATA"/container_sleep.json > "$ctrconfig" + + ctr_id=$(crictl run "$ctrconfig" "$sboxconfig") + check_conmon_fields "$ctr_id" "$shares" "$set" +} + +@test "test workload pod should not be set if annotation not specified" { + shares="" + set="" + name=POD + create_workload "200" "0-1" + + start_crio + + jq --arg act "$activation" --arg set "{\"cpuset\": \"$set\"}" --arg setkey "$prefix/$name" \ + ' .annotations[$setkey] = $set' \ + "$TESTDATA"/sandbox_config.json > "$sboxconfig" + + jq --arg act "$activation" --arg name "$name" --arg set "{\"cpuset\": \"$set\"}" --arg setkey "$prefix/$name" \ + ' .annotations[$setkey] = $set + | del(.linux.resources.cpu_shares)' \ + "$TESTDATA"/container_sleep.json > "$ctrconfig" + + ctr_id=$(crictl run "$ctrconfig" "$sboxconfig") + check_conmon_fields "$ctr_id" "$shares" "$set" +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/freezer.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/freezer.go index 441531fd77d..6afd17851ad 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/freezer.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/freezer.go @@ -3,9 +3,12 @@ package fs2 import ( + "bufio" stdErrors "errors" + "fmt" "os" "strings" + "time" "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" "github.com/opencontainers/runc/libcontainer/configs" @@ -14,16 +17,6 @@ import ( ) func setFreezer(dirPath string, state configs.FreezerState) error { - if err := supportsFreezer(dirPath); err != nil { - // We can ignore this request as long as the user didn't ask us to - // freeze the container (since without the freezer cgroup, that's a - // no-op). 
- if state == configs.Undefined || state == configs.Thawed { - return nil - } - return errors.Wrap(err, "freezer not supported") - } - var stateStr string switch state { case configs.Undefined: @@ -36,11 +29,23 @@ func setFreezer(dirPath string, state configs.FreezerState) error { return errors.Errorf("invalid freezer state %q requested", state) } - if err := fscommon.WriteFile(dirPath, "cgroup.freeze", stateStr); err != nil { + fd, err := fscommon.OpenFile(dirPath, "cgroup.freeze", unix.O_RDWR) + if err != nil { + // We can ignore this request as long as the user didn't ask us to + // freeze the container (since without the freezer cgroup, that's a + // no-op). + if state != configs.Frozen { + return nil + } + return errors.Wrap(err, "freezer not supported") + } + defer fd.Close() + + if _, err := fd.WriteString(stateStr); err != nil { return err } // Confirm that the cgroup did actually change states. - if actualState, err := getFreezer(dirPath); err != nil { + if actualState, err := readFreezer(dirPath, fd); err != nil { return err } else if actualState != state { return errors.Errorf(`expected "cgroup.freeze" to be in state %q but was in %q`, state, actualState) @@ -48,13 +53,8 @@ func setFreezer(dirPath string, state configs.FreezerState) error { return nil } -func supportsFreezer(dirPath string) error { - _, err := fscommon.ReadFile(dirPath, "cgroup.freeze") - return err -} - func getFreezer(dirPath string) (configs.FreezerState, error) { - state, err := fscommon.ReadFile(dirPath, "cgroup.freeze") + fd, err := fscommon.OpenFile(dirPath, "cgroup.freeze", unix.O_RDONLY) if err != nil { // If the kernel is too old, then we just treat the freezer as being in // an "undefined" state. 
@@ -63,12 +63,67 @@ func getFreezer(dirPath string) (configs.FreezerState, error) { } return configs.Undefined, err } - switch strings.TrimSpace(state) { - case "0": + defer fd.Close() + + return readFreezer(dirPath, fd) +} + +func readFreezer(dirPath string, fd *os.File) (configs.FreezerState, error) { + if _, err := fd.Seek(0, 0); err != nil { + return configs.Undefined, err + } + state := make([]byte, 2) + if _, err := fd.Read(state); err != nil { + return configs.Undefined, err + } + switch string(state) { + case "0\n": return configs.Thawed, nil - case "1": - return configs.Frozen, nil + case "1\n": + return waitFrozen(dirPath) default: return configs.Undefined, errors.Errorf(`unknown "cgroup.freeze" state: %q`, state) } } + +// waitFrozen polls cgroup.events until it sees "frozen 1" in it. +func waitFrozen(dirPath string) (configs.FreezerState, error) { + fd, err := fscommon.OpenFile(dirPath, "cgroup.events", unix.O_RDONLY) + if err != nil { + return configs.Undefined, err + } + defer fd.Close() + + // XXX: Simple wait/read/retry is used here. An implementation + // based on poll(2) or inotify(7) is possible, but it makes the code + // much more complicated. Maybe address this later. + const ( + // Perform maxIter with waitTime in between iterations. + waitTime = 10 * time.Millisecond + maxIter = 1000 + ) + scanner := bufio.NewScanner(fd) + for i := 0; scanner.Scan(); { + if i == maxIter { + return configs.Undefined, fmt.Errorf("timeout of %s reached waiting for the cgroup to freeze", waitTime*maxIter) + } + line := scanner.Text() + val := strings.TrimPrefix(line, "frozen ") + if val != line { // got prefix + if val[0] == '1' { + return configs.Frozen, nil + } + + i++ + // wait, then re-read + time.Sleep(waitTime) + _, err := fd.Seek(0, 0) + if err != nil { + return configs.Undefined, err + } + } + } + // Should only reach here either on read error, + // or if the file does not contain "frozen " line. 
+ return configs.Undefined, scanner.Err() +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/common.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/common.go index 91c314e09ea..de69617ee44 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/common.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/common.go @@ -476,7 +476,7 @@ func addCpuset(cm *dbusConnManager, props *[]systemdDbus.Property, cpus, mems st } if cpus != "" { - bits, err := rangeToBits(cpus) + bits, err := RangeToBits(cpus) if err != nil { return fmt.Errorf("resources.CPU.Cpus=%q conversion error: %w", cpus, err) @@ -485,7 +485,7 @@ func addCpuset(cm *dbusConnManager, props *[]systemdDbus.Property, cpus, mems st newProp("AllowedCPUs", bits)) } if mems != "" { - bits, err := rangeToBits(mems) + bits, err := RangeToBits(mems) if err != nil { return fmt.Errorf("resources.CPU.Mems=%q conversion error: %w", mems, err) diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/cpuset.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/cpuset.go index 07098218883..264f4c89353 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/cpuset.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/cpuset.go @@ -9,11 +9,11 @@ import ( "github.com/willf/bitset" ) -// rangeToBits converts a text representation of a CPU mask (as written to +// RangeToBits converts a text representation of a CPU mask (as written to // or read from cgroups' cpuset.* files, e.g. "1,3-5") to a slice of bytes // with the corresponding bits set (as consumed by systemd over dbus as // AllowedCPUs/AllowedMemoryNodes unit property value). 
-func rangeToBits(str string) ([]byte, error) { +func RangeToBits(str string) ([]byte, error) { bits := &bitset.BitSet{} for _, r := range strings.Split(str, ",") { diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/v2.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/v2.go index 8abb0feb748..a7a2264c78c 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/v2.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/v2.go @@ -96,7 +96,7 @@ func unifiedResToSystemdProps(cm *dbusConnManager, res map[string]string) (props newProp("CPUWeight", num)) case "cpuset.cpus", "cpuset.mems": - bits, err := rangeToBits(v) + bits, err := RangeToBits(v) if err != nil { return nil, fmt.Errorf("unified resource %q=%q conversion error: %w", k, v, err) } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go index 042ba1a2e3a..14a0960389f 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go @@ -31,9 +31,10 @@ type IDMap struct { // for syscalls. Additional architectures can be added by specifying them in // Architectures. 
type Seccomp struct { - DefaultAction Action `json:"default_action"` - Architectures []string `json:"architectures"` - Syscalls []*Syscall `json:"syscalls"` + DefaultAction Action `json:"default_action"` + Architectures []string `json:"architectures"` + Syscalls []*Syscall `json:"syscalls"` + DefaultErrnoRet *uint `json:"default_errno_ret"` } // Action is taken upon rule match in Seccomp diff --git a/vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go b/vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go index 1b72b7a1c1b..cd78f23e1bd 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go @@ -3,12 +3,15 @@ package utils import ( "encoding/binary" "encoding/json" + "fmt" "io" "os" "path/filepath" + "strconv" "strings" "unsafe" + "github.com/cyphar/filepath-securejoin" "golang.org/x/sys/unix" ) @@ -88,6 +91,57 @@ func CleanPath(path string) string { return filepath.Clean(path) } +// stripRoot returns the passed path, stripping the root path if it was +// (lexicially) inside it. Note that both passed paths will always be treated +// as absolute, and the returned path will also always be absolute. In +// addition, the paths are cleaned before stripping the root. +func stripRoot(root, path string) string { + // Make the paths clean and absolute. + root, path = CleanPath("/"+root), CleanPath("/"+path) + switch { + case path == root: + path = "/" + case root == "/": + // do nothing + case strings.HasPrefix(path, root+"/"): + path = strings.TrimPrefix(path, root+"/") + } + return CleanPath("/" + path) +} + +// WithProcfd runs the passed closure with a procfd path (/proc/self/fd/...) +// corresponding to the unsafePath resolved within the root. Before passing the +// fd, this path is verified to have been inside the root -- so operating on it +// through the passed fdpath should be safe. 
Do not access this path through +// the original path strings, and do not attempt to use the pathname outside of +// the passed closure (the file handle will be freed once the closure returns). +func WithProcfd(root, unsafePath string, fn func(procfd string) error) error { + // Remove the root then forcefully resolve inside the root. + unsafePath = stripRoot(root, unsafePath) + path, err := securejoin.SecureJoin(root, unsafePath) + if err != nil { + return fmt.Errorf("resolving path inside rootfs failed: %v", err) + } + + // Open the target path. + fh, err := os.OpenFile(path, unix.O_PATH|unix.O_CLOEXEC, 0) + if err != nil { + return fmt.Errorf("open o_path procfd: %w", err) + } + defer fh.Close() + + // Double-check the path is the one we expected. + procfd := "/proc/self/fd/" + strconv.Itoa(int(fh.Fd())) + if realpath, err := os.Readlink(procfd); err != nil { + return fmt.Errorf("procfd verification failed: %w", err) + } else if realpath != path { + return fmt.Errorf("possibly malicious path detected -- refusing to operate on %s", realpath) + } + + // Run the closure. + return fn(procfd) +} + // SearchLabels searches a list of key-value pairs for the provided key and // returns the corresponding value. The pairs must be separated with '='. func SearchLabels(labels []string, query string) string { diff --git a/vendor/k8s.io/kubernetes/pkg/kubelet/leaky/leaky.go b/vendor/k8s.io/kubernetes/pkg/kubelet/leaky/leaky.go deleted file mode 100644 index 7c75002c47e..00000000000 --- a/vendor/k8s.io/kubernetes/pkg/kubelet/leaky/leaky.go +++ /dev/null @@ -1,25 +0,0 @@ -/* -Copyright 2015 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -// Package leaky holds bits of kubelet that should be internal but have leaked -// out through bad abstractions. TODO: delete all of this. -package leaky - -const ( - // PodInfraContainerName is used in a few places outside of Kubelet, such as indexing - // into the container info. - PodInfraContainerName = "POD" -) diff --git a/vendor/modules.txt b/vendor/modules.txt index 3e38645c9c7..6c2af1ee9f9 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -700,7 +700,7 @@ github.com/opencontainers/go-digest ## explicit github.com/opencontainers/image-spec/specs-go github.com/opencontainers/image-spec/specs-go/v1 -# github.com/opencontainers/runc v1.0.0-rc94 +# github.com/opencontainers/runc v1.0.0-rc95.0.20210521141834-a95237f81684 ## explicit github.com/opencontainers/runc/libcontainer/apparmor github.com/opencontainers/runc/libcontainer/cgroups @@ -1371,7 +1371,6 @@ k8s.io/kubernetes/pkg/kubelet/container k8s.io/kubernetes/pkg/kubelet/cri/streaming k8s.io/kubernetes/pkg/kubelet/cri/streaming/portforward k8s.io/kubernetes/pkg/kubelet/cri/streaming/remotecommand -k8s.io/kubernetes/pkg/kubelet/leaky k8s.io/kubernetes/pkg/kubelet/types k8s.io/kubernetes/pkg/proxy k8s.io/kubernetes/pkg/proxy/config