From 0db299c161b82c9af7443eb076952170b70321f9 Mon Sep 17 00:00:00 2001 From: Artyom Lukianov Date: Wed, 6 Apr 2022 16:19:36 +0300 Subject: [PATCH] Retry setting CPU load balancing before returning the error It is possible that the kernel rebuilds the sched_domain related files, and because of that, enabling or disabling CPU load balancing for container CPUs fails with various file errors: 1. lstat /proc/sys/kernel/sched_domain/cpu22/domain1/flags: no such file or directory 2. readdirent /proc/sys/kernel/sched_domain/cpu66/domain0: no such file or directory Add retry logic around setting CPU load balancing values to reduce the possibility of such errors. Signed-off-by: Artyom Lukianov --- .../high_performance_hooks.go | 26 ++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/internal/runtimehandlerhooks/high_performance_hooks.go b/internal/runtimehandlerhooks/high_performance_hooks.go index f270cd722b0..9c11eb0acc2 100644 --- a/internal/runtimehandlerhooks/high_performance_hooks.go +++ b/internal/runtimehandlerhooks/high_performance_hooks.go @@ -9,6 +9,7 @@ import ( "path/filepath" "strconv" "strings" + "time" "github.com/cri-o/cri-o/internal/config/cgmgr" "github.com/cri-o/cri-o/internal/lib/sandbox" @@ -21,6 +22,7 @@ import ( "github.com/pkg/errors" "github.com/sirupsen/logrus" "k8s.io/apimachinery/pkg/fields" + "k8s.io/apimachinery/pkg/util/wait" "k8s.io/kubernetes/pkg/kubelet/cm/cpuset" ) @@ -65,8 +67,7 @@ func (h *HighPerformanceHooks) PreStart(ctx context.Context, c *oci.Container, s // disable the CPU load balancing for the container CPUs if shouldCPULoadBalancingBeDisabled(s.Annotations()) { - log.Infof(ctx, "Disable cpu load balancing for container %q", c.ID()) - if err := setCPUSLoadBalancing(c, false, schedDomainDir); err != nil { + if err := setCPUSLoadBalancingWithRetry(ctx, c, false); err != nil { return errors.Wrap(err, "set CPU load balancing") } } @@ -112,7 +113,7 @@ func (h *HighPerformanceHooks) PreStop(ctx context.Context, c 
*oci.Container, s // enable the CPU load balancing for the container CPUs if shouldCPULoadBalancingBeDisabled(s.Annotations()) { - if err := setCPUSLoadBalancing(c, true, schedDomainDir); err != nil { + if err := setCPUSLoadBalancingWithRetry(ctx, c, true); err != nil { return errors.Wrap(err, "set CPU load balancing") } } @@ -172,6 +173,25 @@ func isContainerRequestWholeCPU(c *oci.Container) bool { return *(c.Spec().Linux.Resources.CPU.Shares)%1024 == 0 } +func setCPUSLoadBalancingWithRetry(ctx context.Context, c *oci.Container, enable bool) error { + log.Infof(ctx, "Set cpu load balancing (enable=%t) for container %q", enable, c.ID()) + // Reading or writing sched_domain files can fail with "no such file or directory" while + // the kernel rebuilds them with updated values, so only not-exist errors are retried: once per second, for up to five seconds. + // The retry does not eliminate such failures entirely, but it should keep their likelihood to a minimum. + // TODO: revisit once there is a more acceptable cgroups hierarchy that allows disabling CPU load balancing + // correctly via cgroups, see https://bugzilla.redhat.com/show_bug.cgi?id=1946801 + return wait.PollImmediate(time.Second, 5*time.Second, func() (bool, error) { + if err := setCPUSLoadBalancing(c, enable, schedDomainDir); err != nil { + if os.IsNotExist(err) { + log.Errorf(ctx, "Failed to set CPU load balancing: %v", err) + return false, nil + } + return false, err + } + return true, nil + }) +} + func setCPUSLoadBalancing(c *oci.Container, enable bool, schedDomainDir string) error { lspec := c.Spec().Linux if lspec == nil ||