@@ -10,6 +10,7 @@ import (
1010 "net/url"
1111 "os"
1212 "os/exec"
13+ "path"
1314 "path/filepath"
1415 "strings"
1516 "time"
@@ -23,6 +24,9 @@ import (
2324 "golang.org/x/term"
2425 "golang.org/x/xerrors"
2526
27+ "cdr.dev/slog"
28+ "cdr.dev/slog/sloggers/sloghuman"
29+
2630 "github.com/coder/coder/agent/agentssh"
2731 "github.com/coder/coder/cli/clibase"
2832 "github.com/coder/coder/cli/cliui"
@@ -46,6 +50,8 @@ func (r *RootCmd) ssh() *clibase.Cmd {
4650 identityAgent string
4751 wsPollInterval time.Duration
4852 noWait bool
53+ logDir string
54+ logToFile bool
4955 )
5056 client := new (codersdk.Client )
5157 cmd := & clibase.Cmd {
@@ -56,10 +62,44 @@ func (r *RootCmd) ssh() *clibase.Cmd {
5662 clibase .RequireNArgs (1 ),
5763 r .InitClient (client ),
5864 ),
59- Handler : func (inv * clibase.Invocation ) error {
65+ Handler : func (inv * clibase.Invocation ) ( retErr error ) {
6066 ctx , cancel := context .WithCancel (inv .Context ())
6167 defer cancel ()
6268
69+ logger := slog .Make () // empty logger
70+ defer func () {
71+ if retErr != nil {
72+ // catch and log all returned errors so we see them in the
73+ // log file (if there is one)
74+ logger .Error (ctx , "command exit" , slog .Error (retErr ))
75+ }
76+ }()
77+ if logToFile {
78+ // we need a way to ensure different ssh invocations don't clobber
79+ // each other's logs. Date-time strings will likely have collisions
80+ // in unit tests and/or scripts unless we extend precision out to
81+ // sub-millisecond, which seems unwieldy. A simple 5-character random
82+ // string will do it, since the operating system already tracks
83+ // dates and times for file IO.
84+ qual , err := cryptorand .String (5 )
85+ if err != nil {
86+ return xerrors .Errorf ("generate random qualifier: %w" , err )
87+ }
88+ logPth := path .Join (logDir , fmt .Sprintf ("coder-ssh-%s.log" , qual ))
89+ logFile , err := os .Create (logPth )
90+ if err != nil {
91+ return xerrors .Errorf ("error opening %s for logging: %w" , logPth , err )
92+ }
93+ logger = slog .Make (sloghuman .Sink (logFile ))
94+ defer logFile .Close ()
95+ if r .verbose {
96+ logger = logger .Leveled (slog .LevelDebug )
97+ }
98+
99+ // log HTTP requests
100+ client .Logger = logger
101+ }
102+
63103 workspace , workspaceAgent , err := getWorkspaceAndAgent (ctx , inv , client , codersdk .Me , inv .Args [0 ])
64104 if err != nil {
65105 return err
@@ -92,110 +132,71 @@ func (r *RootCmd) ssh() *clibase.Cmd {
92132 // We don't print the error because cliui.Agent does that for us.
93133 }
94134
95- conn , err := client .DialWorkspaceAgent (ctx , workspaceAgent .ID , & codersdk.DialWorkspaceAgentOptions {})
135+ conn , err := client .DialWorkspaceAgent (ctx , workspaceAgent .ID , & codersdk.DialWorkspaceAgentOptions {
136+ Logger : logger ,
137+ })
96138 if err != nil {
97- return err
139+ return xerrors . Errorf ( "dial agent: %w" , err )
98140 }
99141 defer conn .Close ()
100142 conn .AwaitReachable (ctx )
101143 stopPolling := tryPollWorkspaceAutostop (ctx , client , workspace )
102144 defer stopPolling ()
103145
104- // Enure connection is closed if the context is canceled or
105- // the workspace reaches the stopped state.
106- //
107- // Watching the stopped state is a work-around for cases
108- // where the agent is not gracefully shut down and the
109- // connection is left open. If, for instance, the networking
110- // is stopped before the agent is shut down, the disconnect
111- // will usually not propagate.
112- //
113- // See: https://github.com/coder/coder/issues/6180
114- watchAndClose := func (closer func () error ) {
115- // Ensure session is ended on both context cancellation
116- // and workspace stop.
117- defer func () {
118- _ = closer ()
119- }()
120-
121- startWatchLoop:
122- for {
123- // (Re)connect to the coder server and watch workspace events.
124- var wsWatch <- chan codersdk.Workspace
125- var err error
126- for r := retry .New (time .Second , 15 * time .Second ); r .Wait (ctx ); {
127- wsWatch , err = client .WatchWorkspace (ctx , workspace .ID )
128- if err == nil {
129- break
130- }
131- if ctx .Err () != nil {
132- return
133- }
134- }
135-
136- for {
137- select {
138- case <- ctx .Done ():
139- return
140- case w , ok := <- wsWatch :
141- if ! ok {
142- continue startWatchLoop
143- }
144-
145- // Transitioning to stop or delete could mean that
146- // the agent will still gracefully stop. If a new
147- // build is starting, there's no reason to wait for
148- // the agent, it should be long gone.
149- if workspace .LatestBuild .ID != w .LatestBuild .ID && w .LatestBuild .Transition == codersdk .WorkspaceTransitionStart {
150- return
151- }
152- // Note, we only react to the stopped state here because we
153- // want to give the agent a chance to gracefully shut down
154- // during "stopping".
155- if w .LatestBuild .Status == codersdk .WorkspaceStatusStopped {
156- return
157- }
158- }
159- }
160- }
161- }
162-
163146 if stdio {
164147 rawSSH , err := conn .SSH (ctx )
165148 if err != nil {
166- return err
149+ return xerrors . Errorf ( "connect SSH: %w" , err )
167150 }
168151 defer rawSSH .Close ()
169- go watchAndClose (rawSSH .Close )
152+ go watchAndClose (ctx , rawSSH .Close , logger , client , workspace )
170153
171154 go func () {
172155 // Ensure stdout copy closes incase stdin is closed
173156 // unexpectedly. Typically we wouldn't worry about
174157 // this since OpenSSH should kill the proxy command.
175158 defer rawSSH .Close ()
176159
177- _ , _ = io .Copy (rawSSH , inv .Stdin )
160+ _ , err := io .Copy (rawSSH , inv .Stdin )
161+ if err != nil {
162+ logger .Error (ctx , "copy stdin error" , slog .Error (err ))
163+ } else {
164+ logger .Debug (ctx , "copy stdin complete" )
165+ }
178166 }()
179- _ , _ = io .Copy (inv .Stdout , rawSSH )
167+ _ , err = io .Copy (inv .Stdout , rawSSH )
168+ if err != nil {
169+ logger .Error (ctx , "copy stdout error" , slog .Error (err ))
170+ } else {
171+ logger .Debug (ctx , "copy stdout complete" )
172+ }
180173 return nil
181174 }
182175
183176 sshClient , err := conn .SSHClient (ctx )
184177 if err != nil {
185- return err
178+ return xerrors . Errorf ( "ssh client: %w" , err )
186179 }
187180 defer sshClient .Close ()
188181
189182 sshSession , err := sshClient .NewSession ()
190183 if err != nil {
191- return err
184+ return xerrors . Errorf ( "ssh session: %w" , err )
192185 }
193186 defer sshSession .Close ()
194- go watchAndClose (func () error {
195- _ = sshSession .Close ()
196- _ = sshClient .Close ()
197- return nil
198- })
187+ go watchAndClose (
188+ ctx ,
189+ func () error {
190+ err := sshSession .Close ()
191+ logger .Debug (ctx , "session close" , slog .Error (err ))
192+ err = sshClient .Close ()
193+ logger .Debug (ctx , "client close" , slog .Error (err ))
194+ return nil
195+ },
196+ logger ,
197+ client ,
198+ workspace ,
199+ )
199200
200201 if identityAgent == "" {
201202 identityAgent = os .Getenv ("SSH_AUTH_SOCK" )
@@ -257,7 +258,7 @@ func (r *RootCmd) ssh() *clibase.Cmd {
257258
258259 err = sshSession .RequestPty ("xterm-256color" , 128 , 128 , gossh.TerminalModes {})
259260 if err != nil {
260- return err
261+ return xerrors . Errorf ( "request pty: %w" , err )
261262 }
262263
263264 sshSession .Stdin = inv .Stdin
@@ -266,7 +267,7 @@ func (r *RootCmd) ssh() *clibase.Cmd {
266267
267268 err = sshSession .Shell ()
268269 if err != nil {
269- return err
270+ return xerrors . Errorf ( "start shell: %w" , err )
270271 }
271272
272273 // Put cancel at the top of the defer stack to initiate
@@ -289,7 +290,7 @@ func (r *RootCmd) ssh() *clibase.Cmd {
289290 if errors .Is (err , & gossh.ExitMissingError {}) {
290291 return xerrors .New ("SSH connection ended unexpectedly" )
291292 }
292- return err
293+ return xerrors . Errorf ( "session ended: %w" , err )
293294 }
294295
295296 return nil
@@ -335,10 +336,90 @@ func (r *RootCmd) ssh() *clibase.Cmd {
335336 Description : "Specifies whether to wait for a workspace to become ready before logging in (only applicable when the login before ready option has not been enabled). Note that the workspace agent may still be in the process of executing the startup script and the workspace may be in an incomplete state." ,
336337 Value : clibase .BoolOf (& noWait ),
337338 },
339+ {
340+ Flag : "log-dir" ,
341+ Default : os .TempDir (),
342+ Description : "Specify the location for the log files." ,
343+ Env : "CODER_SSH_LOG_DIR" ,
344+ Value : clibase .StringOf (& logDir ),
345+ },
346+ {
347+ Flag : "log-to-file" ,
348+ FlagShorthand : "l" ,
349+ Env : "CODER_SSH_LOG_TO_FILE" ,
350+ Description : "Enable diagnostic logging to file." ,
351+ Value : clibase .BoolOf (& logToFile ),
352+ },
338353 }
339354 return cmd
340355}
341356
357+ // watchAndClose ensures closer is called if the context is canceled or
358+ // the workspace reaches the stopped state.
359+ //
360+ // Watching the stopped state is a work-around for cases
361+ // where the agent is not gracefully shut down and the
362+ // connection is left open. If, for instance, the networking
363+ // is stopped before the agent is shut down, the disconnect
364+ // will usually not propagate.
365+ //
366+ // See: https://github.com/coder/coder/issues/6180
367+ func watchAndClose (ctx context.Context , closer func () error , logger slog.Logger , client * codersdk.Client , workspace codersdk.Workspace ) {
368+ // Ensure session is ended on both context cancellation
369+ // and workspace stop.
370+ defer func () {
371+ err := closer ()
372+ if err != nil {
373+ logger .Error (ctx , "error closing session" , slog .Error (err ))
374+ }
375+ }()
376+
377+ startWatchLoop:
378+ for {
379+ logger .Debug (ctx , "(re)connecting to the coder server to watch workspace events." )
380+ var wsWatch <- chan codersdk.Workspace
381+ var err error
382+ for r := retry .New (time .Second , 15 * time .Second ); r .Wait (ctx ); {
383+ wsWatch , err = client .WatchWorkspace (ctx , workspace .ID )
384+ if err == nil {
385+ break
386+ }
387+ if ctx .Err () != nil {
388+ logger .Info (ctx , "context expired" , slog .Error (ctx .Err ()))
389+ return
390+ }
391+ }
392+
393+ for {
394+ select {
395+ case <- ctx .Done ():
396+ logger .Info (ctx , "context expired" , slog .Error (ctx .Err ()))
397+ return
398+ case w , ok := <- wsWatch :
399+ if ! ok {
400+ continue startWatchLoop
401+ }
402+
403+ // Transitioning to stop or delete could mean that
404+ // the agent will still gracefully stop. If a new
405+ // build is starting, there's no reason to wait for
406+ // the agent, it should be long gone.
407+ if workspace .LatestBuild .ID != w .LatestBuild .ID && w .LatestBuild .Transition == codersdk .WorkspaceTransitionStart {
408+ logger .Info (ctx , "new build started" )
409+ return
410+ }
411+ // Note, we only react to the stopped state here because we
412+ // want to give the agent a chance to gracefully shut down
413+ // during "stopping".
414+ if w .LatestBuild .Status == codersdk .WorkspaceStatusStopped {
415+ logger .Info (ctx , "workspace stopped" )
416+ return
417+ }
418+ }
419+ }
420+ }
421+ }
422+
342423// getWorkspaceAgent returns the workspace and agent selected using either the
343424// `<workspace>[.<agent>]` syntax via `in` or picks a random workspace and agent
344425// if `shuffle` is true.
0 commit comments