From 0f3af93653aed0472360d1621e5f3ba774bedf3c Mon Sep 17 00:00:00 2001 From: Mathias Fredriksson Date: Wed, 19 Apr 2023 11:58:30 +0000 Subject: [PATCH 1/7] fix(cli/ssh): Avoid connection hang when workspace is stopped Two issues are addressed here: 1. We were not detecting disconnects due to waiting for Stdin to close (disconnect would only propagate after entering input and failing to write to the connection). 2. In other scenarios, where the connection drop is not detected, we now also watch workspace status and drop the connection when a workspace reaches the stopped state. Fixes: https://github.com/coder/jetbrains-coder/issues/199 Refs: #6180, #6175 --- cli/ssh.go | 57 +++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 48 insertions(+), 9 deletions(-) diff --git a/cli/ssh.go b/cli/ssh.go index e1ebbcd04cfd2..3a5f721a82c75 100644 --- a/cli/ssh.go +++ b/cli/ssh.go @@ -100,17 +100,62 @@ func (r *RootCmd) ssh() *clibase.Cmd { stopPolling := tryPollWorkspaceAutostop(ctx, client, workspace) defer stopPolling() + // Ensure connection is closed if the context is canceled or + // the workspace reaches the stopped state. + // + // Watching the stopped state is a work-around for cases + // where the agent is not gracefully shut down and the + // connection is left open. If, for instance, the networking + // is stopped before the agent is shut down, the disconnect + // will usually not propagate. + // + // See: https://github.com/coder/coder/issues/6180 + wsWatch, err := client.WatchWorkspace(ctx, workspace.ID) + if err != nil { + return err + } + watchAndClose := func(c io.Closer) { + // Ensure session is ended on both context cancellation + // and workspace stop. + defer c.Close() + + for { + select { + case <-ctx.Done(): + return + case w, ok := <-wsWatch: + if !ok { + return + } + + // Note, we only react to the stopped state here because we + // want to give the agent a chance to gracefully shut down + // during "stopping". 
+ if w.LatestBuild.Status == codersdk.WorkspaceStatusStopped { + _, _ = fmt.Fprintf(inv.Stderr, "Workspace %q has stopped. Closing connection.\r\n", workspace.Name) + return + } + } + } + } + if stdio { rawSSH, err := conn.SSH(ctx) if err != nil { return err } defer rawSSH.Close() + go watchAndClose(rawSSH) go func() { - _, _ = io.Copy(inv.Stdout, rawSSH) + // Ensure stdout copy closes incase stdin is closed + // unexpectedly. Typically we wouldn't worry about + // this since OpenSSH should kill the proxy command. + defer rawSSH.Close() + + _, _ = io.Copy(rawSSH, inv.Stdin) }() - _, _ = io.Copy(rawSSH, inv.Stdin) + _, _ = io.Copy(inv.Stdout, rawSSH) return nil } @@ -125,13 +170,7 @@ func (r *RootCmd) ssh() *clibase.Cmd { return err } defer sshSession.Close() - - // Ensure context cancellation is propagated to the - // SSH session, e.g. to cancel `Wait()` at the end. - go func() { - <-ctx.Done() - _ = sshSession.Close() - }() + go watchAndClose(sshSession) if identityAgent == "" { identityAgent = os.Getenv("SSH_AUTH_SOCK") From b44db5a4e2093b3aae9156d5401548d2c419b96d Mon Sep 17 00:00:00 2001 From: Mathias Fredriksson Date: Wed, 19 Apr 2023 12:32:31 +0000 Subject: [PATCH 2/7] Close SSH client as well due to use of multiple sessions during forwarding --- cli/ssh.go | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/cli/ssh.go b/cli/ssh.go index 3a5f721a82c75..2a831f25e9c31 100644 --- a/cli/ssh.go +++ b/cli/ssh.go @@ -114,10 +114,12 @@ func (r *RootCmd) ssh() *clibase.Cmd { if err != nil { return err } - watchAndClose := func(c io.Closer) { + watchAndClose := func(closer func() error) { // Ensure session is ended on both context cancellation // and workspace stop. 
- defer c.Close() + defer func() { + _ = closer() + }() for { select { @@ -145,7 +147,7 @@ func (r *RootCmd) ssh() *clibase.Cmd { return err } defer rawSSH.Close() - go watchAndClose(rawSSH) + go watchAndClose(rawSSH.Close) go func() { // Ensure stdout copy closes incase stdin is closed @@ -170,7 +172,11 @@ func (r *RootCmd) ssh() *clibase.Cmd { return err } defer sshSession.Close() - go watchAndClose(sshSession) + go watchAndClose(func() error { + _ = sshSession.Close() + _ = sshClient.Close() + return nil + }) if identityAgent == "" { identityAgent = os.Getenv("SSH_AUTH_SOCK") From 918cf07d7cbb35cdc11702b8527290e9e21019b6 Mon Sep 17 00:00:00 2001 From: Mathias Fredriksson Date: Wed, 19 Apr 2023 13:01:38 +0000 Subject: [PATCH 3/7] Allow reconnecting to coder server --- cli/ssh.go | 46 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 32 insertions(+), 14 deletions(-) diff --git a/cli/ssh.go b/cli/ssh.go index 2a831f25e9c31..fb0ce5fb57173 100644 --- a/cli/ssh.go +++ b/cli/ssh.go @@ -30,6 +30,7 @@ import ( "github.com/coder/coder/coderd/util/ptr" "github.com/coder/coder/codersdk" "github.com/coder/coder/cryptorand" + "github.com/coder/retry" ) var ( @@ -110,10 +111,6 @@ func (r *RootCmd) ssh() *clibase.Cmd { // will usually not propagate. // // See: https://github.com/coder/coder/issues/6180 - wsWatch, err := client.WatchWorkspace(ctx, workspace.ID) - if err != nil { - return err - } watchAndClose := func(closer func() error) { // Ensure session is ended on both context cancellation // and workspace stop. @@ -121,21 +118,42 @@ func (r *RootCmd) ssh() *clibase.Cmd { _ = closer() }() + startWatchLoop: for { - select { - case <-ctx.Done(): - return - case w, ok := <-wsWatch: - if !ok { + // (Re)connect to the coder server and watch workspace events. 
+ var wsWatch <-chan codersdk.Workspace + for r := retry.New(time.Second, 15*time.Second); r.Wait(ctx); { + wsWatch, err = client.WatchWorkspace(ctx, workspace.ID) + if err == nil { + break + } + if ctx.Err() != nil { return } + } - // Note, we only react to the stopped state here because we - // want to give the agent a chance to gracefully shut down - // during "stopping". - if w.LatestBuild.Status == codersdk.WorkspaceStatusStopped { - _, _ = fmt.Fprintf(inv.Stderr, "Workspace %q has stopped. Closing connection.\r\n", workspace.Name) + for { + select { + case <-ctx.Done(): return + case w, ok := <-wsWatch: + if !ok { + continue startWatchLoop + } + + // Transitioning to stop or delete could mean that + // the agent will still gracefully stop. If a new + // build is starting, there's no reason to wait for + // the agent, it should be long gone. + if workspace.LatestBuild.ID != w.LatestBuild.ID && w.LatestBuild.Transition == codersdk.WorkspaceTransitionStart { + return + } + // Note, we only react to the stopped state here because we + // want to give the agent a chance to gracefully shut down + // during "stopping". 
+ if w.LatestBuild.Status == codersdk.WorkspaceStatusStopped { + return + } } } } From 0730da3c754a089d44b523d6757c7225a8d3db8f Mon Sep 17 00:00:00 2001 From: Mathias Fredriksson Date: Wed, 19 Apr 2023 13:23:03 +0000 Subject: [PATCH 4/7] Add tests to verify exit on workspace stop --- cli/ssh_test.go | 114 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 114 insertions(+) diff --git a/cli/ssh_test.go b/cli/ssh_test.go index ec1dc1cb46b74..22c54e2c2d983 100644 --- a/cli/ssh_test.go +++ b/cli/ssh_test.go @@ -31,6 +31,7 @@ import ( "github.com/coder/coder/cli/clitest" "github.com/coder/coder/cli/cliui" "github.com/coder/coder/coderd/coderdtest" + "github.com/coder/coder/coderd/database" "github.com/coder/coder/codersdk" "github.com/coder/coder/codersdk/agentsdk" "github.com/coder/coder/provisioner/echo" @@ -143,6 +144,48 @@ func TestSSH(t *testing.T) { cancel() <-cmdDone }) + + t.Run("ExitOnStop", func(t *testing.T) { + t.Parallel() + + client, workspace, agentToken := setupWorkspaceForAgent(t, nil) + inv, root := clitest.New(t, "ssh", workspace.Name) + clitest.SetupConfig(t, client, root) + pty := ptytest.New(t).Attach(inv) + + ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitLong) + defer cancel() + + cmdDone := tGo(t, func() { + err := inv.WithContext(ctx).Run() + assert.Error(t, err) + }) + pty.ExpectMatch("Waiting") + + agentClient := agentsdk.New(client.URL) + agentClient.SetSessionToken(agentToken) + agentCloser := agent.New(agent.Options{ + Client: agentClient, + Logger: slogtest.Make(t, nil).Named("agent"), + }) + defer func() { + _ = agentCloser.Close() + }() + + // Ensure the agent is connected. 
+ pty.WriteLine("echo hell'o'") + pty.ExpectMatchContext(ctx, "hello") + + workspace = coderdtest.MustTransitionWorkspace(t, client, workspace.ID, database.WorkspaceTransitionStart, database.WorkspaceTransitionStop) + coderdtest.AwaitWorkspaceBuildJob(t, client, workspace.LatestBuild.ID) + + select { + case <-cmdDone: + case <-ctx.Done(): + require.Fail(t, "command did not exit in time") + } + }) + t.Run("Stdio", func(t *testing.T) { t.Parallel() client, workspace, agentToken := setupWorkspaceForAgent(t, nil) @@ -207,6 +250,77 @@ func TestSSH(t *testing.T) { <-cmdDone }) + + t.Run("StdioExitOnStop", func(t *testing.T) { + t.Parallel() + client, workspace, agentToken := setupWorkspaceForAgent(t, nil) + _, _ = tGoContext(t, func(ctx context.Context) { + // Run this async so the SSH command has to wait for + // the build and agent to connect! + agentClient := agentsdk.New(client.URL) + agentClient.SetSessionToken(agentToken) + agentCloser := agent.New(agent.Options{ + Client: agentClient, + Logger: slogtest.Make(t, nil).Named("agent"), + }) + <-ctx.Done() + _ = agentCloser.Close() + }) + + clientOutput, clientInput := io.Pipe() + serverOutput, serverInput := io.Pipe() + defer func() { + for _, c := range []io.Closer{clientOutput, clientInput, serverOutput, serverInput} { + _ = c.Close() + } + }() + + ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitLong) + defer cancel() + + inv, root := clitest.New(t, "ssh", "--stdio", workspace.Name) + clitest.SetupConfig(t, client, root) + inv.Stdin = clientOutput + inv.Stdout = serverInput + inv.Stderr = io.Discard + cmdDone := tGo(t, func() { + err := inv.WithContext(ctx).Run() + assert.NoError(t, err) + }) + + conn, channels, requests, err := ssh.NewClientConn(&stdioConn{ + Reader: serverOutput, + Writer: clientInput, + }, "", &ssh.ClientConfig{ + // #nosec + HostKeyCallback: ssh.InsecureIgnoreHostKey(), + }) + require.NoError(t, err) + defer conn.Close() + + sshClient := ssh.NewClient(conn, channels, 
requests) + defer sshClient.Close() + + session, err := sshClient.NewSession() + require.NoError(t, err) + defer session.Close() + + err = session.Shell() + require.NoError(t, err) + + workspace = coderdtest.MustTransitionWorkspace(t, client, workspace.ID, database.WorkspaceTransitionStart, database.WorkspaceTransitionStop) + coderdtest.AwaitWorkspaceBuildJob(t, client, workspace.LatestBuild.ID) + + select { + case <-cmdDone: + case <-ctx.Done(): + require.Fail(t, "command did not exit in time") + } + + err = session.Wait() + require.NoError(t, err) + }) + t.Run("ForwardAgent", func(t *testing.T) { if runtime.GOOS == "windows" { t.Skip("Test not supported on windows") From f876e84eac12fa63919d8030bbba6948b9becf04 Mon Sep 17 00:00:00 2001 From: Mathias Fredriksson Date: Wed, 19 Apr 2023 13:28:49 +0000 Subject: [PATCH 5/7] Fix err race --- cli/ssh.go | 1 + 1 file changed, 1 insertion(+) diff --git a/cli/ssh.go b/cli/ssh.go index fb0ce5fb57173..5b89ca9fc56ad 100644 --- a/cli/ssh.go +++ b/cli/ssh.go @@ -122,6 +122,7 @@ func (r *RootCmd) ssh() *clibase.Cmd { for { // (Re)connect to the coder server and watch workspace events. 
var wsWatch <-chan codersdk.Workspace + var err error for r := retry.New(time.Second, 15*time.Second); r.Wait(ctx); { wsWatch, err = client.WatchWorkspace(ctx, workspace.ID) if err == nil { From 3c93fa7cc47e99a2d3174ce3b03c353410942513 Mon Sep 17 00:00:00 2001 From: Mathias Fredriksson Date: Wed, 19 Apr 2023 13:49:15 +0000 Subject: [PATCH 6/7] Fix test --- cli/ssh_test.go | 5 ----- 1 file changed, 5 deletions(-) diff --git a/cli/ssh_test.go b/cli/ssh_test.go index 22c54e2c2d983..34675e150feda 100644 --- a/cli/ssh_test.go +++ b/cli/ssh_test.go @@ -177,7 +177,6 @@ func TestSSH(t *testing.T) { pty.ExpectMatchContext(ctx, "hello") workspace = coderdtest.MustTransitionWorkspace(t, client, workspace.ID, database.WorkspaceTransitionStart, database.WorkspaceTransitionStop) - coderdtest.AwaitWorkspaceBuildJob(t, client, workspace.LatestBuild.ID) select { case <-cmdDone: @@ -309,16 +308,12 @@ func TestSSH(t *testing.T) { require.NoError(t, err) workspace = coderdtest.MustTransitionWorkspace(t, client, workspace.ID, database.WorkspaceTransitionStart, database.WorkspaceTransitionStop) - coderdtest.AwaitWorkspaceBuildJob(t, client, workspace.LatestBuild.ID) select { case <-cmdDone: case <-ctx.Done(): require.Fail(t, "command did not exit in time") } - - err = session.Wait() - require.NoError(t, err) }) t.Run("ForwardAgent", func(t *testing.T) { From 767eb3b71dec37a2960c5e954aac7357251c62d1 Mon Sep 17 00:00:00 2001 From: Mathias Fredriksson Date: Wed, 19 Apr 2023 15:56:17 +0000 Subject: [PATCH 7/7] fix --- cli/ssh_test.go | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cli/ssh_test.go b/cli/ssh_test.go index 34675e150feda..ee544a328e2ea 100644 --- a/cli/ssh_test.go +++ b/cli/ssh_test.go @@ -147,6 +147,9 @@ func TestSSH(t *testing.T) { t.Run("ExitOnStop", func(t *testing.T) { t.Parallel() + if runtime.GOOS == "windows" { + t.Skip("Windows doesn't seem to clean up the process, maybe #7100 will fix it") + } client, workspace, agentToken := setupWorkspaceForAgent(t, nil) 
inv, root := clitest.New(t, "ssh", workspace.Name) @@ -252,6 +255,9 @@ func TestSSH(t *testing.T) { t.Run("StdioExitOnStop", func(t *testing.T) { t.Parallel() + if runtime.GOOS == "windows" { + t.Skip("Windows doesn't seem to clean up the process, maybe #7100 will fix it") + } client, workspace, agentToken := setupWorkspaceForAgent(t, nil) _, _ = tGoContext(t, func(ctx context.Context) { // Run this async so the SSH command has to wait for