Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit cb73754

Browse files
feat: add startup script logs to the ui (#6558)
* Add startup script logs to the database * Add coderd endpoints for startup script logs * Push startup script logs from agent * Pull startup script logs on frontend * Rename queries * Add constraint * Start creating log sending loop * Add log sending to the agent * Add tests for streaming logs * Shorten notify channel name * Add FE * Improve bulk log performance * Finish UI display * Fix startup log visibility * Add warning for overflow * Fix agent queue logs overflow * Display startup logs in a virtual DOM for performance * Fix agent queue with loads of logs * Fix authorize test * Remove faulty test * Fix startup and shutdown reporting error * Fix gen * Fix comments * Periodically purge old database entries * Add test fixture for migration * Add Storybook * Check if there are logs when displaying features * Fix startup component overflow gap * Fix startup log wrapping --------- Co-authored-by: Asher <[email protected]>
1 parent a6fa8ca commit cb73754

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

57 files changed

+2514
-354
lines changed

agent/agent.go

Lines changed: 138 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ import (
4141
"cdr.dev/slog"
4242
"github.com/coder/coder/agent/usershell"
4343
"github.com/coder/coder/buildinfo"
44+
"github.com/coder/coder/coderd/database"
4445
"github.com/coder/coder/coderd/gitauth"
4546
"github.com/coder/coder/codersdk"
4647
"github.com/coder/coder/codersdk/agentsdk"
@@ -88,6 +89,7 @@ type Client interface {
8889
PostLifecycle(ctx context.Context, state agentsdk.PostLifecycleRequest) error
8990
PostAppHealth(ctx context.Context, req agentsdk.PostAppHealthsRequest) error
9091
PostStartup(ctx context.Context, req agentsdk.PostStartupRequest) error
92+
PatchStartupLogs(ctx context.Context, req agentsdk.PatchStartupLogs) error
9193
}
9294

9395
func New(options Options) io.Closer {
@@ -642,13 +644,32 @@ func (a *agent) runScript(ctx context.Context, lifecycle, script string) error {
642644
}
643645

644646
a.logger.Info(ctx, "running script", slog.F("lifecycle", lifecycle), slog.F("script", script))
645-
writer, err := a.filesystem.OpenFile(filepath.Join(a.logDir, fmt.Sprintf("coder-%s-script.log", lifecycle)), os.O_CREATE|os.O_RDWR, 0o600)
647+
fileWriter, err := a.filesystem.OpenFile(filepath.Join(a.logDir, fmt.Sprintf("coder-%s-script.log", lifecycle)), os.O_CREATE|os.O_RDWR, 0o600)
646648
if err != nil {
647649
return xerrors.Errorf("open %s script log file: %w", lifecycle, err)
648650
}
649651
defer func() {
650-
_ = writer.Close()
652+
_ = fileWriter.Close()
651653
}()
654+
655+
var writer io.Writer = fileWriter
656+
if lifecycle == "startup" {
657+
// Create pipes for startup logs reader and writer
658+
logsReader, logsWriter := io.Pipe()
659+
defer func() {
660+
_ = logsReader.Close()
661+
}()
662+
writer = io.MultiWriter(fileWriter, logsWriter)
663+
flushedLogs, err := a.trackScriptLogs(ctx, logsReader)
664+
if err != nil {
665+
return xerrors.Errorf("track script logs: %w", err)
666+
}
667+
defer func() {
668+
_ = logsWriter.Close()
669+
<-flushedLogs
670+
}()
671+
}
672+
652673
cmd, err := a.createCommand(ctx, script, nil)
653674
if err != nil {
654675
return xerrors.Errorf("create command: %w", err)
@@ -664,10 +685,124 @@ func (a *agent) runScript(ctx context.Context, lifecycle, script string) error {
664685

665686
return xerrors.Errorf("run: %w", err)
666687
}
667-
668688
return nil
669689
}
670690

691+
func (a *agent) trackScriptLogs(ctx context.Context, reader io.Reader) (chan struct{}, error) {
692+
// Initialize variables for log management
693+
queuedLogs := make([]agentsdk.StartupLog, 0)
694+
var flushLogsTimer *time.Timer
695+
var logMutex sync.Mutex
696+
logsFlushed := sync.NewCond(&sync.Mutex{})
697+
var logsSending bool
698+
defer func() {
699+
logMutex.Lock()
700+
if flushLogsTimer != nil {
701+
flushLogsTimer.Stop()
702+
}
703+
logMutex.Unlock()
704+
}()
705+
706+
// sendLogs function uploads the queued logs to the server
707+
sendLogs := func() {
708+
// Lock logMutex and check if logs are already being sent
709+
logMutex.Lock()
710+
if logsSending {
711+
logMutex.Unlock()
712+
return
713+
}
714+
if flushLogsTimer != nil {
715+
flushLogsTimer.Stop()
716+
}
717+
if len(queuedLogs) == 0 {
718+
logMutex.Unlock()
719+
return
720+
}
721+
// Move the current queued logs to logsToSend and clear the queue
722+
logsToSend := queuedLogs
723+
logsSending = true
724+
queuedLogs = make([]agentsdk.StartupLog, 0)
725+
logMutex.Unlock()
726+
727+
// Retry uploading logs until successful or a specific error occurs
728+
for r := retry.New(time.Second, 5*time.Second); r.Wait(ctx); {
729+
err := a.client.PatchStartupLogs(ctx, agentsdk.PatchStartupLogs{
730+
Logs: logsToSend,
731+
})
732+
if err == nil {
733+
break
734+
}
735+
var sdkErr *codersdk.Error
736+
if errors.As(err, &sdkErr) {
737+
if sdkErr.StatusCode() == http.StatusRequestEntityTooLarge {
738+
a.logger.Warn(ctx, "startup logs too large, dropping logs")
739+
break
740+
}
741+
}
742+
a.logger.Error(ctx, "upload startup logs", slog.Error(err), slog.F("to_send", logsToSend))
743+
}
744+
// Reset logsSending flag
745+
logMutex.Lock()
746+
logsSending = false
747+
flushLogsTimer.Reset(100 * time.Millisecond)
748+
logMutex.Unlock()
749+
logsFlushed.Broadcast()
750+
}
751+
// queueLog function appends a log to the queue and triggers sendLogs if necessary
752+
queueLog := func(log agentsdk.StartupLog) {
753+
logMutex.Lock()
754+
defer logMutex.Unlock()
755+
756+
// Append log to the queue
757+
queuedLogs = append(queuedLogs, log)
758+
759+
// If there are more than 100 logs, send them immediately
760+
if len(queuedLogs) > 100 {
761+
// Don't early return after this, because we still want
762+
// to reset the timer just in case logs come in while
763+
// we're sending.
764+
go sendLogs()
765+
}
766+
// Reset or set the flushLogsTimer to trigger sendLogs after 100 milliseconds
767+
if flushLogsTimer != nil {
768+
flushLogsTimer.Reset(100 * time.Millisecond)
769+
return
770+
}
771+
flushLogsTimer = time.AfterFunc(100*time.Millisecond, sendLogs)
772+
}
773+
774+
// It's important that we either flush or drop all logs before returning
775+
// because the startup state is reported after flush.
776+
//
777+
// It'd be weird for the startup state to be ready, but logs are still
778+
// coming in.
779+
logsFinished := make(chan struct{})
780+
err := a.trackConnGoroutine(func() {
781+
scanner := bufio.NewScanner(reader)
782+
for scanner.Scan() {
783+
queueLog(agentsdk.StartupLog{
784+
CreatedAt: database.Now(),
785+
Output: scanner.Text(),
786+
})
787+
}
788+
defer close(logsFinished)
789+
logsFlushed.L.Lock()
790+
for {
791+
logMutex.Lock()
792+
if len(queuedLogs) == 0 {
793+
logMutex.Unlock()
794+
break
795+
}
796+
logMutex.Unlock()
797+
logsFlushed.Wait()
798+
}
799+
})
800+
if err != nil {
801+
return nil, xerrors.Errorf("track conn goroutine: %w", err)
802+
}
803+
return logsFinished, nil
804+
}
805+
671806
func (a *agent) init(ctx context.Context) {
672807
// Clients should ignore the host key when connecting.
673808
// The agent needs to authenticate with coderd to SSH,

agent/agent_test.go

Lines changed: 89 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@ import (
88
"fmt"
99
"io"
1010
"net"
11+
"net/http"
12+
"net/http/httptest"
1113
"net/netip"
1214
"os"
1315
"os/exec"
@@ -31,15 +33,14 @@ import (
3133
"github.com/stretchr/testify/require"
3234
"go.uber.org/goleak"
3335
"golang.org/x/crypto/ssh"
34-
"golang.org/x/text/encoding/unicode"
35-
"golang.org/x/text/transform"
3636
"golang.org/x/xerrors"
3737
"tailscale.com/net/speedtest"
3838
"tailscale.com/tailcfg"
3939

4040
"cdr.dev/slog"
4141
"cdr.dev/slog/sloggers/slogtest"
4242
"github.com/coder/coder/agent"
43+
"github.com/coder/coder/coderd/httpapi"
4344
"github.com/coder/coder/codersdk"
4445
"github.com/coder/coder/codersdk/agentsdk"
4546
"github.com/coder/coder/pty/ptytest"
@@ -739,37 +740,78 @@ func TestAgent_SSHConnectionEnvVars(t *testing.T) {
739740

740741
func TestAgent_StartupScript(t *testing.T) {
741742
t.Parallel()
743+
output := "something"
744+
command := "sh -c 'echo " + output + "'"
742745
if runtime.GOOS == "windows" {
743-
t.Skip("This test doesn't work on Windows for some reason...")
746+
command = "cmd.exe /c echo " + output
744747
}
745-
content := "output"
746-
//nolint:dogsled
747-
_, _, _, fs, _ := setupAgent(t, agentsdk.Metadata{
748-
StartupScript: "echo " + content,
749-
}, 0)
750-
var gotContent string
751-
require.Eventually(t, func() bool {
752-
outputPath := filepath.Join(os.TempDir(), "coder-startup-script.log")
753-
content, err := afero.ReadFile(fs, outputPath)
754-
if err != nil {
755-
t.Logf("read file %q: %s", outputPath, err)
756-
return false
757-
}
758-
if len(content) == 0 {
759-
t.Logf("no content in %q", outputPath)
760-
return false
748+
t.Run("Success", func(t *testing.T) {
749+
t.Parallel()
750+
client := &client{
751+
t: t,
752+
agentID: uuid.New(),
753+
metadata: agentsdk.Metadata{
754+
StartupScript: command,
755+
DERPMap: &tailcfg.DERPMap{},
756+
},
757+
statsChan: make(chan *agentsdk.Stats),
758+
coordinator: tailnet.NewCoordinator(),
761759
}
762-
if runtime.GOOS == "windows" {
763-
// Windows uses UTF16! 🪟🪟🪟
764-
content, _, err = transform.Bytes(unicode.UTF16(unicode.LittleEndian, unicode.UseBOM).NewDecoder(), content)
765-
if !assert.NoError(t, err) {
766-
return false
767-
}
760+
closer := agent.New(agent.Options{
761+
Client: client,
762+
Filesystem: afero.NewMemMapFs(),
763+
Logger: slogtest.Make(t, nil).Named("agent").Leveled(slog.LevelDebug),
764+
ReconnectingPTYTimeout: 0,
765+
})
766+
t.Cleanup(func() {
767+
_ = closer.Close()
768+
})
769+
assert.Eventually(t, func() bool {
770+
got := client.getLifecycleStates()
771+
return len(got) > 0 && got[len(got)-1] == codersdk.WorkspaceAgentLifecycleReady
772+
}, testutil.WaitShort, testutil.IntervalMedium)
773+
774+
require.Len(t, client.getStartupLogs(), 1)
775+
require.Equal(t, output, client.getStartupLogs()[0].Output)
776+
})
777+
// This ensures that even when coderd sends back that the startup
778+
// script has written too many lines it will still succeed!
779+
t.Run("OverflowsAndSkips", func(t *testing.T) {
780+
t.Parallel()
781+
client := &client{
782+
t: t,
783+
agentID: uuid.New(),
784+
metadata: agentsdk.Metadata{
785+
StartupScript: command,
786+
DERPMap: &tailcfg.DERPMap{},
787+
},
788+
patchWorkspaceLogs: func() error {
789+
resp := httptest.NewRecorder()
790+
httpapi.Write(context.Background(), resp, http.StatusRequestEntityTooLarge, codersdk.Response{
791+
Message: "Too many lines!",
792+
})
793+
res := resp.Result()
794+
defer res.Body.Close()
795+
return codersdk.ReadBodyAsError(res)
796+
},
797+
statsChan: make(chan *agentsdk.Stats),
798+
coordinator: tailnet.NewCoordinator(),
768799
}
769-
gotContent = string(content)
770-
return true
771-
}, testutil.WaitShort, testutil.IntervalMedium)
772-
require.Equal(t, content, strings.TrimSpace(gotContent))
800+
closer := agent.New(agent.Options{
801+
Client: client,
802+
Filesystem: afero.NewMemMapFs(),
803+
Logger: slogtest.Make(t, nil).Named("agent").Leveled(slog.LevelDebug),
804+
ReconnectingPTYTimeout: 0,
805+
})
806+
t.Cleanup(func() {
807+
_ = closer.Close()
808+
})
809+
assert.Eventually(t, func() bool {
810+
got := client.getLifecycleStates()
811+
return len(got) > 0 && got[len(got)-1] == codersdk.WorkspaceAgentLifecycleReady
812+
}, testutil.WaitShort, testutil.IntervalMedium)
813+
require.Len(t, client.getStartupLogs(), 0)
814+
})
773815
}
774816

775817
func TestAgent_Lifecycle(t *testing.T) {
@@ -1495,10 +1537,12 @@ type client struct {
14951537
statsChan chan *agentsdk.Stats
14961538
coordinator tailnet.Coordinator
14971539
lastWorkspaceAgent func()
1540+
patchWorkspaceLogs func() error
14981541

14991542
mu sync.Mutex // Protects following.
15001543
lifecycleStates []codersdk.WorkspaceAgentLifecycle
15011544
startup agentsdk.PostStartupRequest
1545+
logs []agentsdk.StartupLog
15021546
}
15031547

15041548
func (c *client) Metadata(_ context.Context) (agentsdk.Metadata, error) {
@@ -1583,6 +1627,22 @@ func (c *client) PostStartup(_ context.Context, startup agentsdk.PostStartupRequ
15831627
return nil
15841628
}
15851629

1630+
func (c *client) getStartupLogs() []agentsdk.StartupLog {
1631+
c.mu.Lock()
1632+
defer c.mu.Unlock()
1633+
return c.logs
1634+
}
1635+
1636+
func (c *client) PatchStartupLogs(_ context.Context, logs agentsdk.PatchStartupLogs) error {
1637+
c.mu.Lock()
1638+
defer c.mu.Unlock()
1639+
if c.patchWorkspaceLogs != nil {
1640+
return c.patchWorkspaceLogs()
1641+
}
1642+
c.logs = append(c.logs, logs.Logs...)
1643+
return nil
1644+
}
1645+
15861646
// tempDirUnixSocket returns a temporary directory that can safely hold unix
15871647
// sockets (probably).
15881648
//

cli/agent.go

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,10 @@ func workspaceAgent() *cobra.Command {
118118
client := agentsdk.New(coderURL)
119119
client.SDK.Logger = logger
120120
// Set a reasonable timeout so requests can't hang forever!
121-
client.SDK.HTTPClient.Timeout = 10 * time.Second
121+
// The timeout needs to be reasonably long, because requests
122+
// with large payloads can take a bit, e.g. startup script logs
123+
// may take a while to insert.
124+
client.SDK.HTTPClient.Timeout = 30 * time.Second
122125

123126
// Enable pprof handler
124127
// This prevents the pprof import from being accidentally deleted.

cli/server.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ import (
6565
"github.com/coder/coder/coderd/autobuild/executor"
6666
"github.com/coder/coder/coderd/database"
6767
"github.com/coder/coder/coderd/database/dbfake"
68+
"github.com/coder/coder/coderd/database/dbpurge"
6869
"github.com/coder/coder/coderd/database/migrations"
6970
"github.com/coder/coder/coderd/devtunnel"
7071
"github.com/coder/coder/coderd/gitauth"
@@ -993,6 +994,10 @@ flags, and YAML configuration. The precedence is as follows:
993994
shutdownConnsCtx, shutdownConns := context.WithCancel(ctx)
994995
defer shutdownConns()
995996

997+
// Ensures that old database entries are cleaned up over time!
998+
purger := dbpurge.New(ctx, logger, options.Database)
999+
defer purger.Close()
1000+
9961001
// Wrap the server in middleware that redirects to the access URL if
9971002
// the request is not to a local IP.
9981003
var handler http.Handler = coderAPI.RootHandler

0 commit comments

Comments
 (0)