From 4853de05af7f3104bdfe81d7ba7b219905c697eb Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Thu, 26 May 2022 18:42:34 -0700 Subject: [PATCH] utils/RunUnderSystemdScope: fix wrt channel deadlock As seen in [1], sometimes coreos/go-systemd/dbus package deadlocks: the jobComplete is stuck trying to send job result string to the channel while holding the jobListener lock, while startJob (called by StartTransientUnit) waits for the same lock. Alas, it is not clear why the channel is not being read, nor was I able to reproduce it locally. Make the job result channel buffered, so jobListener won't block on channel send and thus StartTransientUnit won't be stuck either. While at it, - move the error wrapping out of mgr.RetryOnDisconnect function, and use fmt.Errorf with %w instead of obsoleted errors.Wrap; - improve error messages, printing the systemd unit name (so we can check it in systemd log); - do check the job result string -- in case it is not "done", return an error back to the caller, which should help avoid other issues down the line. [1] https://bugzilla.redhat.com/show_bug.cgi?id=2082344 Signed-off-by: Kir Kolyshkin --- utils/utils.go | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/utils/utils.go b/utils/utils.go index 0ef5c19c455..6087a43ecfa 100644 --- a/utils/utils.go +++ b/utils/utils.go @@ -70,16 +70,24 @@ func RunUnderSystemdScope(mgr *dbusmgr.DbusConnManager, pid int, slice, unitName if slice != "" { properties = append(properties, systemdDbus.PropSlice(slice)) } - ch := make(chan string) + // Make a buffered channel so that the sender (go-systemd's jobComplete) + // won't be blocked on channel send while holding the jobListener lock + // (RHBZ#2082344). 
+ ch := make(chan string, 1) if err := mgr.RetryOnDisconnect(func(c *systemdDbus.Conn) error { _, err = c.StartTransientUnitContext(ctx, unitName, "replace", properties, ch) - return errors.Wrap(err, "start transient unit") - }); err != nil { return err + }); err != nil { + return fmt.Errorf("start transient unit %q: %w", unitName, err) } - // Block until job is started + // Wait for the job status. select { + case s := <-ch: + close(ch) + if s != "done" { + return fmt.Errorf("error moving conmon with pid %d to systemd unit %s: got %s", pid, unitName, s) + } case <-ch: close(ch) case <-time.After(time.Minute * 6): @@ -89,7 +97,7 @@ func RunUnderSystemdScope(mgr *dbusmgr.DbusConnManager, pid int, slice, unitName // We also don't use the native context cancelling behavior of the dbus library, // because experience has shown that it does not help. // TODO: Find cause of the request being dropped in the dbus library and fix it. - return errors.Errorf("timed out moving conmon with pid %d to cgroup", pid) + return errors.Errorf("timed out moving conmon with pid %d to systemd unit %s", pid, unitName) } return nil