diff --git a/pkg/annotations/annotations.go b/pkg/annotations/annotations.go index 51920ebca0f..97a0bde9605 100644 --- a/pkg/annotations/annotations.go +++ b/pkg/annotations/annotations.go @@ -8,6 +8,9 @@ const ( // UsernsMode is the user namespace mode to use UsernsModeAnnotation = "io.kubernetes.cri-o.userns-mode" + // Cgroup2RWAnnotation specifies mounting v2 cgroups as an rw filesystem. + Cgroup2RWAnnotation = "io.kubernetes.cri-o.cgroup2-mount-hierarchy-rw" + // UnifiedCgroupAnnotation specifies the unified configuration for cgroup v2 UnifiedCgroupAnnotation = "io.kubernetes.cri-o.UnifiedCgroup" @@ -39,6 +42,7 @@ const ( var AllAllowedAnnotations = []string{ UsernsModeAnnotation, + Cgroup2RWAnnotation, UnifiedCgroupAnnotation, ShmSizeAnnotation, DevicesAnnotation, diff --git a/pkg/config/template.go b/pkg/config/template.go index fc8e85bed34..7b225de3010 100644 --- a/pkg/config/template.go +++ b/pkg/config/template.go @@ -1005,6 +1005,7 @@ const templateStringCrioRuntimeRuntimesRuntimeHandler = `# The "crio.runtime.run # a list of experimental annotations that this runtime handler is allowed to process. # The currently recognized values are: # "io.kubernetes.cri-o.userns-mode" for configuring a user namespace for the pod. +# "io.kubernetes.cri-o.cgroup2-mount-hierarchy-rw" for mounting cgroups writably when set to "true". # "io.kubernetes.cri-o.Devices" for configuring devices for the pod. # "io.kubernetes.cri-o.ShmSize" for configuring the size of /dev/shm. # "io.kubernetes.cri-o.UnifiedCgroup.$CTR_NAME" for configuring the cgroup v2 unified block for a container. 
diff --git a/server/container_create_linux.go b/server/container_create_linux.go index 4df71492861..0a8567d385a 100644 --- a/server/container_create_linux.go +++ b/server/container_create_linux.go @@ -291,7 +291,9 @@ func (s *Server) createSandboxContainer(ctx context.Context, ctr ctrIface.Contai skipRelabel = true } - containerVolumes, ociMounts, err := addOCIBindMounts(ctx, ctr, mountLabel, s.config.RuntimeConfig.BindMountPrefix, s.config.AbsentMountSourcesToReject, maybeRelabel, skipRelabel) + cgroup2RW := node.CgroupIsV2() && sb.Annotations()[crioann.Cgroup2RWAnnotation] == "true" + + containerVolumes, ociMounts, err := addOCIBindMounts(ctx, ctr, mountLabel, s.config.RuntimeConfig.BindMountPrefix, s.config.AbsentMountSourcesToReject, maybeRelabel, skipRelabel, cgroup2RW) if err != nil { return nil, err } @@ -843,7 +845,7 @@ func clearReadOnly(m *rspec.Mount) { m.Options = append(m.Options, "rw") } -func addOCIBindMounts(ctx context.Context, ctr ctrIface.Container, mountLabel, bindMountPrefix string, absentMountSourcesToReject []string, maybeRelabel, skipRelabel bool) ([]oci.ContainerVolume, []rspec.Mount, error) { +func addOCIBindMounts(ctx context.Context, ctr ctrIface.Container, mountLabel, bindMountPrefix string, absentMountSourcesToReject []string, maybeRelabel, skipRelabel, cgroup2RW bool) ([]oci.ContainerVolume, []rspec.Mount, error) { volumes := []oci.ContainerVolume{} ociMounts := []rspec.Mount{} containerConfig := ctr.Config() @@ -978,7 +980,13 @@ func addOCIBindMounts(ctx context.Context, ctr ctrIface.Container, mountLabel, b Destination: "/sys/fs/cgroup", Type: "cgroup", Source: "cgroup", - Options: []string{"nosuid", "noexec", "nodev", "relatime", "ro"}, + Options: []string{"nosuid", "noexec", "nodev", "relatime"}, + } + + if cgroup2RW { + m.Options = append(m.Options, "rw") + } else { + m.Options = append(m.Options, "ro") } specgen.AddMount(m) } diff --git a/server/container_create_linux_test.go b/server/container_create_linux_test.go index 
4653fcb1081..814688e2728 100644 --- a/server/container_create_linux_test.go +++ b/server/container_create_linux_test.go @@ -34,7 +34,7 @@ func TestAddOCIBindsForDev(t *testing.T) { t.Error(err) } - _, binds, err := addOCIBindMounts(context.Background(), ctr, "", "", nil, false, false) + _, binds, err := addOCIBindMounts(context.Background(), ctr, "", "", nil, false, false, false) if err != nil { t.Error(err) } @@ -78,7 +78,7 @@ func TestAddOCIBindsForSys(t *testing.T) { t.Error(err) } - _, binds, err := addOCIBindMounts(context.Background(), ctr, "", "", nil, false, false) + _, binds, err := addOCIBindMounts(context.Background(), ctr, "", "", nil, false, false, false) if err != nil { t.Error(err) } @@ -92,3 +92,73 @@ func TestAddOCIBindsForSys(t *testing.T) { t.Error("there is not a single /sys bind mount") } } + +// TestAddOCIBindsCGroupRW checks that the /sys/fs/cgroup mount carries "rw" when cgroup2RW is true and "ro" when it is false. +func TestAddOCIBindsCGroupRW(t *testing.T) { + ctr, err := container.New() + if err != nil { + t.Fatal(err) + } + + if err := ctr.SetConfig(&types.ContainerConfig{ + Metadata: &types.ContainerMetadata{ + Name: "testctr", + }, + }, &types.PodSandboxConfig{ + Metadata: &types.PodSandboxMetadata{ + Name: "testpod", + }, + }); err != nil { + t.Fatal(err) + } + _, _, err = addOCIBindMounts(context.Background(), ctr, "", "", nil, false, false, true) + if err != nil { + t.Fatal(err) + } + var hasCgroupRW bool + for _, m := range ctr.Spec().Mounts() { + if m.Destination == "/sys/fs/cgroup" { + for _, o := range m.Options { + if o == "rw" { + hasCgroupRW = true + } + } + } + } + if !hasCgroupRW { + t.Error("Cgroup mount not added with RW.") + } + + ctr, err = container.New() + if err != nil { + t.Fatal(err) + } + if err := ctr.SetConfig(&types.ContainerConfig{ + Metadata: &types.ContainerMetadata{ + Name: "testctr", + }, + }, &types.PodSandboxConfig{ + Metadata: &types.PodSandboxMetadata{ + Name: "testpod", + }, + }); err != nil { + t.Fatal(err) + } + var hasCgroupRO bool + _, _, err = addOCIBindMounts(context.Background(), ctr, "", "", nil, false, false, false) +
if err != nil { + t.Fatal(err) + } + for _, m := range ctr.Spec().Mounts() { + if m.Destination == "/sys/fs/cgroup" { + for _, o := range m.Options { + if o == "ro" { + hasCgroupRO = true + } + } + } + } + if !hasCgroupRO { + t.Error("Cgroup mount not added with RO.") + } +}