Thanks to visit codestin.com
Credit goes to github.com

Skip to content

feat: add computed workspace and agent health fields to the api #8280

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Jul 10, 2023
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion cli/testdata/coder_list_--output_json.golden
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,10 @@
"ttl_ms": 28800000,
"last_used_at": "[timestamp]",
"deleting_at": null,
"locked_at": null
"locked_at": null,
"health": {
"healthy": true,
"failing_agents": []
}
}
]
49 changes: 49 additions & 0 deletions coderd/apidoc/docs.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

49 changes: 49 additions & 0 deletions coderd/apidoc/swagger.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

18 changes: 18 additions & 0 deletions coderd/workspaceagents.go
Original file line number Diff line number Diff line change
Expand Up @@ -1262,6 +1262,24 @@ func convertWorkspaceAgent(derpMap *tailcfg.DERPMap, coordinator tailnet.Coordin
workspaceAgent.ReadyAt = &dbAgent.ReadyAt.Time
}

switch {
case workspaceAgent.Status != codersdk.WorkspaceAgentConnected && workspaceAgent.LifecycleState == codersdk.WorkspaceAgentLifecycleOff:
workspaceAgent.Health.Reason = "agent is not running"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I might be not in the loop, but what is the reason for returning the human-readable reason over API? Did you consider putting this mapping in the site/UI code?

Let's say that we want the site/UI to take an action on agent is taking too long to connect, does it mean that we have to compare strings?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a valid concern. I suppose this method allows us to skip the introduction of a new enum and simply surface the reason in the UI. This is similar to how we do with API errors currently where we have a human readable reason in the HTTP response.

If we make this an enum, it needs to be kept up-to-date over UI, CLI, etc.

case workspaceAgent.Status == codersdk.WorkspaceAgentTimeout:
workspaceAgent.Health.Reason = "agent is taking too long to connect"
case workspaceAgent.Status == codersdk.WorkspaceAgentDisconnected:
workspaceAgent.Health.Reason = "agent has lost connection"
// Note: We could also handle codersdk.WorkspaceAgentLifecycleStartTimeout
// here, but it's more of a soft issue, so we don't want to mark the agent
// as unhealthy.
case workspaceAgent.LifecycleState == codersdk.WorkspaceAgentLifecycleStartError:
workspaceAgent.Health.Reason = "agent startup script exited with an error"
case workspaceAgent.LifecycleState.ShuttingDown():
workspaceAgent.Health.Reason = "agent is shutting down"
default:
workspaceAgent.Health.Healthy = true
}

return workspaceAgent, nil
}

Expand Down
3 changes: 3 additions & 0 deletions coderd/workspaceagents_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ func TestWorkspaceAgent(t *testing.T) {
require.Equal(t, tmpDir, workspace.LatestBuild.Resources[0].Agents[0].Directory)
_, err = client.WorkspaceAgent(ctx, workspace.LatestBuild.Resources[0].Agents[0].ID)
require.NoError(t, err)
require.True(t, workspace.LatestBuild.Resources[0].Agents[0].Health.Healthy)
})
t.Run("HasFallbackTroubleshootingURL", func(t *testing.T) {
t.Parallel()
Expand Down Expand Up @@ -167,6 +168,8 @@ func TestWorkspaceAgent(t *testing.T) {
}, testutil.IntervalMedium, "agent status timeout")

require.Equal(t, wantTroubleshootingURL, workspace.LatestBuild.Resources[0].Agents[0].TroubleshootingURL)
require.False(t, workspace.LatestBuild.Resources[0].Agents[0].Health.Healthy)
require.NotEmpty(t, workspace.LatestBuild.Resources[0].Agents[0].Health.Reason)
})
}

Expand Down
13 changes: 13 additions & 0 deletions coderd/workspaces.go
Original file line number Diff line number Diff line change
Expand Up @@ -1110,6 +1110,15 @@ func convertWorkspace(
lockedAt = &workspace.LockedAt.Time
}

failingAgents := []uuid.UUID{}
for _, resource := range workspaceBuild.Resources {
for _, agent := range resource.Agents {
if !agent.Health.Healthy {
failingAgents = append(failingAgents, agent.ID)
}
}
}

var (
ttlMillis = convertWorkspaceTTLMillis(workspace.Ttl)
deletingAt = calculateDeletingAt(workspace, template, workspaceBuild)
Expand All @@ -1135,6 +1144,10 @@ func convertWorkspace(
LastUsedAt: workspace.LastUsedAt,
DeletingAt: deletingAt,
LockedAt: lockedAt,
Health: codersdk.WorkspaceHealth{
Healthy: len(failingAgents) == 0,
FailingAgents: failingAgents,
},
}
}

Expand Down
87 changes: 87 additions & 0 deletions coderd/workspaces_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ func TestWorkspace(t *testing.T) {
require.NoError(t, err)
require.Equal(t, user.UserID, ws.LatestBuild.InitiatorID)
require.Equal(t, codersdk.BuildReasonInitiator, ws.LatestBuild.Reason)
require.True(t, ws.Health.Healthy)
})

t.Run("Deleted", func(t *testing.T) {
Expand Down Expand Up @@ -164,6 +165,92 @@ func TestWorkspace(t *testing.T) {
assert.Equal(t, templateDisplayName, ws.TemplateDisplayName)
assert.Equal(t, templateAllowUserCancelWorkspaceJobs, ws.TemplateAllowUserCancelWorkspaceJobs)
})

t.Run("Health", func(t *testing.T) {
t.Parallel()

t.Run("Healthy", func(t *testing.T) {
t.Parallel()
client := coderdtest.New(t, &coderdtest.Options{IncludeProvisionerDaemon: true})
user := coderdtest.CreateFirstUser(t, client)
version := coderdtest.CreateTemplateVersion(t, client, user.OrganizationID, &echo.Responses{
Parse: echo.ParseComplete,
ProvisionApply: []*proto.Provision_Response{{
Type: &proto.Provision_Response_Complete{
Complete: &proto.Provision_Complete{
Resources: []*proto.Resource{{
Name: "some",
Type: "example",
Agents: []*proto.Agent{{
Id: uuid.NewString(),
Auth: &proto.Agent_Token{},
}},
}},
},
},
}},
})
coderdtest.AwaitTemplateVersionJob(t, client, version.ID)
template := coderdtest.CreateTemplate(t, client, user.OrganizationID, version.ID)
workspace := coderdtest.CreateWorkspace(t, client, user.OrganizationID, template.ID)
coderdtest.AwaitWorkspaceBuildJob(t, client, workspace.LatestBuild.ID)

ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitLong)
defer cancel()

workspace, err := client.Workspace(ctx, workspace.ID)
require.NoError(t, err)
agent := workspace.LatestBuild.Resources[0].Agents[0]

assert.True(t, agent.Health.Healthy)
assert.Empty(t, agent.Health.Reason)
})

t.Run("Unhealthy", func(t *testing.T) {
t.Parallel()
client := coderdtest.New(t, &coderdtest.Options{IncludeProvisionerDaemon: true})
user := coderdtest.CreateFirstUser(t, client)
version := coderdtest.CreateTemplateVersion(t, client, user.OrganizationID, &echo.Responses{
Parse: echo.ParseComplete,
ProvisionApply: []*proto.Provision_Response{{
Type: &proto.Provision_Response_Complete{
Complete: &proto.Provision_Complete{
Resources: []*proto.Resource{{
Name: "some",
Type: "example",
Agents: []*proto.Agent{{
Id: uuid.NewString(),
Auth: &proto.Agent_Token{},
ConnectionTimeoutSeconds: 1,
}},
}},
},
},
}},
})
coderdtest.AwaitTemplateVersionJob(t, client, version.ID)
template := coderdtest.CreateTemplate(t, client, user.OrganizationID, version.ID)
workspace := coderdtest.CreateWorkspace(t, client, user.OrganizationID, template.ID)
coderdtest.AwaitWorkspaceBuildJob(t, client, workspace.LatestBuild.ID)

ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitLong)
defer cancel()

var err error
testutil.Eventually(ctx, t, func(ctx context.Context) bool {
workspace, err = client.Workspace(ctx, workspace.ID)
t.Logf("%#v", workspace)
return assert.NoError(t, err) && !workspace.Health.Healthy
}, testutil.IntervalMedium)

agent := workspace.LatestBuild.Resources[0].Agents[0]

assert.False(t, workspace.Health.Healthy)
assert.Len(t, workspace.Health.FailingAgents, 1)
assert.False(t, agent.Health.Healthy)
assert.NotEmpty(t, agent.Health.Reason)
})
})
}

func TestAdminViewAllWorkspaces(t *testing.T) {
Expand Down
14 changes: 10 additions & 4 deletions codersdk/workspaceagents.go
Original file line number Diff line number Diff line change
Expand Up @@ -164,10 +164,16 @@ type WorkspaceAgent struct {
ConnectionTimeoutSeconds int32 `json:"connection_timeout_seconds"`
TroubleshootingURL string `json:"troubleshooting_url"`
// Deprecated: Use StartupScriptBehavior instead.
LoginBeforeReady bool `json:"login_before_ready"`
ShutdownScript string `json:"shutdown_script,omitempty"`
ShutdownScriptTimeoutSeconds int32 `json:"shutdown_script_timeout_seconds"`
Subsystem AgentSubsystem `json:"subsystem"`
LoginBeforeReady bool `json:"login_before_ready"`
ShutdownScript string `json:"shutdown_script,omitempty"`
ShutdownScriptTimeoutSeconds int32 `json:"shutdown_script_timeout_seconds"`
Subsystem AgentSubsystem `json:"subsystem"`
Health WorkspaceAgentHealth `json:"health"` // Health reports the health of the agent.
}

type WorkspaceAgentHealth struct {
Healthy bool `json:"healthy" example:"false"` // Healthy is true if the agent is healthy.
Reason string `json:"reason,omitempty" example:"agent has lost connection"` // Reason is a human-readable explanation of the agent's health. It is empty if Healthy is true.
}

type DERPRegion struct {
Expand Down
8 changes: 8 additions & 0 deletions codersdk/workspaces.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,14 @@ type Workspace struct {
// unlocked by an admin. It is subject to deletion if it breaches
// the duration of the locked_ttl field on its template.
LockedAt *time.Time `json:"locked_at" format:"date-time"`
// Health shows the health of the workspace and information about
// what is causing an unhealthy status.
Health WorkspaceHealth `json:"health"`
}

type WorkspaceHealth struct {
Healthy bool `json:"healthy" example:"false"` // Healthy is true if the workspace is healthy.
FailingAgents []uuid.UUID `json:"failing_agents" format:"uuid"` // FailingAgents lists the IDs of the agents that are failing, if any.
}

type WorkspacesRequest struct {
Expand Down
4 changes: 4 additions & 0 deletions docs/api/agents.md
Original file line number Diff line number Diff line change
Expand Up @@ -450,6 +450,10 @@ curl -X GET http://coder-server:8080/api/v2/workspaceagents/{workspaceagent} \
},
"expanded_directory": "string",
"first_connected_at": "2019-08-24T14:15:22Z",
"health": {
"healthy": false,
"reason": "agent has lost connection"
},
"id": "497f6eca-6276-4993-bfeb-53cbbbba6f08",
"instance_id": "string",
"last_connected_at": "2019-08-24T14:15:22Z",
Expand Down
Loading