diff --git a/utils/utils.go b/utils/utils.go index 34eca1c5fd1..0bb95c10da3 100644 --- a/utils/utils.go +++ b/utils/utils.go @@ -14,6 +14,7 @@ import ( "strconv" "strings" "syscall" + "time" "github.com/containers/podman/v3/pkg/lookup" "github.com/cri-o/cri-o/internal/dbusmgr" @@ -71,14 +72,24 @@ func RunUnderSystemdScope(mgr *dbusmgr.DbusConnManager, pid int, slice, unitName ch := make(chan string) if err := mgr.RetryOnDisconnect(func(c *systemdDbus.Conn) error { _, err = c.StartTransientUnit(unitName, "replace", properties, ch) - return err + return errors.Wrap(err, "start transient unit") }); err != nil { return err } // Block until job is started - <-ch - close(ch) + select { + case <-ch: + close(ch) + case <-time.After(time.Minute * 6): + // This case is a work around to catch situations where the dbus library sends the + // request but it unexpectedly disappears. We set the timeout very high to make sure + // we wait as long as possible to catch situations where dbus is overwhelmed. + // We also don't use the native context cancelling behavior of the dbus library, + // because experience has shown that it does not help. + // TODO: Find cause of the request being dropped in the dbus library and fix it. + return errors.Errorf("timed out moving conmon with pid %d to cgroup", pid) + } return nil }