-
Notifications
You must be signed in to change notification settings - Fork 928
chore: implement oom/ood processing component #16436
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
44 commits
Select commit
Hold shift + click to select a range
6c6240b
chore: add workspace reached resource threshold notification
DanielleMaywood b3081de
chore: split out into two notifications
DanielleMaywood a9c8676
chore: update golden files
DanielleMaywood 1a84f96
chore: begin impl of processing logic for oom/ood
DanielleMaywood 78ede46
chore: appease the linter for now
DanielleMaywood 0d2b970
chore: use latest changes to #247, start debounce logic
DanielleMaywood 0df2fd5
chore: add more tests
DanielleMaywood 854d81a
chore: remove mock db for workspace monitor agentapi test
DanielleMaywood 9d9d7b4
chore: remove todo comment
DanielleMaywood 4c21ce7
Merge branch 'main' into dm-internal-247
DanielleMaywood 944fdb5
chore: rewrite ood notification
DanielleMaywood 6444176
chore: updaten golden file
DanielleMaywood bc87268
chore: silly me
DanielleMaywood d2265f6
chore: rename test
DanielleMaywood 82a9852
Merge branch 'dm-internal-247' into dm-internal-248
DanielleMaywood 62621d4
chore: add more tests, fix broken sql query
DanielleMaywood 81f43d3
Merge branch 'main' into dm-internal-248
DanielleMaywood 7522b37
chore: update to match main
DanielleMaywood 69c4f42
chore: run 'make gen'
DanielleMaywood 44ebf65
chore: run 'make fmt'
DanielleMaywood 714e743
chore: remove cruft
DanielleMaywood 7cf5212
chore: align interface
DanielleMaywood d08e713
chore: add another test
DanielleMaywood ed42eae
chore: improve volume monitor test
DanielleMaywood 1b0d0d2
chore: rename fields
DanielleMaywood 51b16c6
Merge branch 'main' into dm-internal-248
DanielleMaywood 4e43bab
chore: align with other branch
DanielleMaywood 4e144ae
Merge branch 'main' into dm-internal-248
DanielleMaywood da25ecc
chore: bump migration number
DanielleMaywood fe1e805
chore: add test and align better
DanielleMaywood abbd522
chore: appease linter
DanielleMaywood 456989e
Merge branch 'main' into dm-internal-248
DanielleMaywood 1550cc6
chore: update rbac
DanielleMaywood 7998f89
chore: handle missing datapoints
DanielleMaywood bda8f29
chore: add tests for unknown state on memory monitor
DanielleMaywood 9d662a3
chore: add tests for missing datapoints in volume monitors
DanielleMaywood bff48dc
chore: add default debounce of 5 minutes
DanielleMaywood c343a70
chore: implement feedback
DanielleMaywood babc48f
chore: feedback
DanielleMaywood 01ca549
chore: feedback
DanielleMaywood a975810
chore: forgot to run the linter
DanielleMaywood ee35d85
chore: use percentages for alert config
DanielleMaywood d2fa8df
Merge branch 'main' into dm-internal-248
DanielleMaywood 27d78d1
chore: fmt and bump migration number
DanielleMaywood File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,20 +4,35 @@ import ( | |
"context" | ||
"database/sql" | ||
"errors" | ||
"fmt" | ||
"time" | ||
|
||
"golang.org/x/xerrors" | ||
|
||
"cdr.dev/slog" | ||
|
||
"github.com/google/uuid" | ||
|
||
"cdr.dev/slog" | ||
"github.com/coder/coder/v2/agent/proto" | ||
"github.com/coder/coder/v2/coderd/agentapi/resourcesmonitor" | ||
"github.com/coder/coder/v2/coderd/database" | ||
"github.com/coder/coder/v2/coderd/database/dbauthz" | ||
"github.com/coder/coder/v2/coderd/database/dbtime" | ||
"github.com/coder/coder/v2/coderd/notifications" | ||
"github.com/coder/quartz" | ||
) | ||
|
||
type ResourcesMonitoringAPI struct { | ||
AgentID uuid.UUID | ||
Database database.Store | ||
Log slog.Logger | ||
AgentID uuid.UUID | ||
WorkspaceID uuid.UUID | ||
|
||
Log slog.Logger | ||
Clock quartz.Clock | ||
Database database.Store | ||
NotificationsEnqueuer notifications.Enqueuer | ||
|
||
Debounce time.Duration | ||
Config resourcesmonitor.Config | ||
} | ||
|
||
func (a *ResourcesMonitoringAPI) GetResourcesMonitoringConfiguration(ctx context.Context, _ *proto.GetResourcesMonitoringConfigurationRequest) (*proto.GetResourcesMonitoringConfigurationResponse, error) { | ||
|
@@ -33,8 +48,8 @@ func (a *ResourcesMonitoringAPI) GetResourcesMonitoringConfiguration(ctx context | |
|
||
return &proto.GetResourcesMonitoringConfigurationResponse{ | ||
Config: &proto.GetResourcesMonitoringConfigurationResponse_Config{ | ||
CollectionIntervalSeconds: 10, | ||
NumDatapoints: 20, | ||
CollectionIntervalSeconds: int32(a.Config.CollectionInterval.Seconds()), | ||
NumDatapoints: a.Config.NumDatapoints, | ||
}, | ||
Memory: func() *proto.GetResourcesMonitoringConfigurationResponse_Memory { | ||
if memoryErr != nil { | ||
|
@@ -60,8 +75,182 @@ func (a *ResourcesMonitoringAPI) GetResourcesMonitoringConfiguration(ctx context | |
} | ||
|
||
func (a *ResourcesMonitoringAPI) PushResourcesMonitoringUsage(ctx context.Context, req *proto.PushResourcesMonitoringUsageRequest) (*proto.PushResourcesMonitoringUsageResponse, error) { | ||
a.Log.Info(ctx, "resources monitoring usage received", | ||
slog.F("request", req)) | ||
var err error | ||
|
||
if memoryErr := a.monitorMemory(ctx, req.Datapoints); memoryErr != nil { | ||
err = errors.Join(err, xerrors.Errorf("monitor memory: %w", memoryErr)) | ||
} | ||
|
||
if volumeErr := a.monitorVolumes(ctx, req.Datapoints); volumeErr != nil { | ||
err = errors.Join(err, xerrors.Errorf("monitor volume: %w", volumeErr)) | ||
} | ||
|
||
return &proto.PushResourcesMonitoringUsageResponse{}, err | ||
} | ||
|
||
func (a *ResourcesMonitoringAPI) monitorMemory(ctx context.Context, datapoints []*proto.PushResourcesMonitoringUsageRequest_Datapoint) error { | ||
monitor, err := a.Database.FetchMemoryResourceMonitorsByAgentID(ctx, a.AgentID) | ||
DanielleMaywood marked this conversation as resolved.
Show resolved
Hide resolved
|
||
if err != nil { | ||
// It is valid for an agent to not have a memory monitor, so we | ||
// do not want to treat it as an error. | ||
if errors.Is(err, sql.ErrNoRows) { | ||
return nil | ||
} | ||
|
||
return xerrors.Errorf("fetch memory resource monitor: %w", err) | ||
} | ||
|
||
if !monitor.Enabled { | ||
return nil | ||
} | ||
|
||
usageDatapoints := make([]*proto.PushResourcesMonitoringUsageRequest_Datapoint_MemoryUsage, 0, len(datapoints)) | ||
for _, datapoint := range datapoints { | ||
usageDatapoints = append(usageDatapoints, datapoint.Memory) | ||
} | ||
|
||
usageStates := resourcesmonitor.CalculateMemoryUsageStates(monitor, usageDatapoints) | ||
|
||
oldState := monitor.State | ||
newState := resourcesmonitor.NextState(a.Config, oldState, usageStates) | ||
|
||
debouncedUntil, shouldNotify := monitor.Debounce(a.Debounce, a.Clock.Now(), oldState, newState) | ||
|
||
//nolint:gocritic // We need to be able to update the resource monitor here. | ||
err = a.Database.UpdateMemoryResourceMonitor(dbauthz.AsResourceMonitor(ctx), database.UpdateMemoryResourceMonitorParams{ | ||
AgentID: a.AgentID, | ||
State: newState, | ||
UpdatedAt: dbtime.Time(a.Clock.Now()), | ||
DebouncedUntil: dbtime.Time(debouncedUntil), | ||
}) | ||
if err != nil { | ||
return xerrors.Errorf("update workspace monitor: %w", err) | ||
} | ||
|
||
if !shouldNotify { | ||
return nil | ||
} | ||
|
||
workspace, err := a.Database.GetWorkspaceByID(ctx, a.WorkspaceID) | ||
if err != nil { | ||
return xerrors.Errorf("get workspace by id: %w", err) | ||
} | ||
|
||
_, err = a.NotificationsEnqueuer.EnqueueWithData( | ||
// nolint:gocritic // We need to be able to send the notification. | ||
dbauthz.AsNotifier(ctx), | ||
workspace.OwnerID, | ||
notifications.TemplateWorkspaceOutOfMemory, | ||
map[string]string{ | ||
"workspace": workspace.Name, | ||
"threshold": fmt.Sprintf("%d%%", monitor.Threshold), | ||
}, | ||
map[string]any{ | ||
// NOTE(DanielleMaywood): | ||
// When notifications are enqueued, they are checked to be | ||
// unique within a single day. This means that if we attempt | ||
// to send two OOM notifications for the same workspace on | ||
// the same day, the enqueuer will prevent us from sending | ||
// a second one. We inject a timestamp to make the | ||
// notifications appear different enough to circumvent this | ||
// deduplication logic. | ||
"timestamp": a.Clock.Now(), | ||
}, | ||
"workspace-monitor-memory", | ||
) | ||
if err != nil { | ||
return xerrors.Errorf("notify workspace OOM: %w", err) | ||
} | ||
|
||
return nil | ||
} | ||
|
||
func (a *ResourcesMonitoringAPI) monitorVolumes(ctx context.Context, datapoints []*proto.PushResourcesMonitoringUsageRequest_Datapoint) error { | ||
volumeMonitors, err := a.Database.FetchVolumesResourceMonitorsByAgentID(ctx, a.AgentID) | ||
if err != nil { | ||
return xerrors.Errorf("get or insert volume monitor: %w", err) | ||
} | ||
|
||
outOfDiskVolumes := make([]map[string]any, 0) | ||
dannykopping marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
for _, monitor := range volumeMonitors { | ||
if !monitor.Enabled { | ||
continue | ||
} | ||
|
||
usageDatapoints := make([]*proto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage, 0, len(datapoints)) | ||
for _, datapoint := range datapoints { | ||
var usage *proto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage | ||
|
||
for _, volume := range datapoint.Volumes { | ||
if volume.Volume == monitor.Path { | ||
usage = volume | ||
break | ||
} | ||
} | ||
|
||
usageDatapoints = append(usageDatapoints, usage) | ||
} | ||
|
||
usageStates := resourcesmonitor.CalculateVolumeUsageStates(monitor, usageDatapoints) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. On second thought, given that we're having to do this for more memory & volumes, I think we should seriously consider updating the agent to send back that bool to indicate enabled but failed to collect; that's a 1:1 with your "unknown" logic here. This can be in a follow-up. |
||
|
||
oldState := monitor.State | ||
newState := resourcesmonitor.NextState(a.Config, oldState, usageStates) | ||
|
||
debouncedUntil, shouldNotify := monitor.Debounce(a.Debounce, a.Clock.Now(), oldState, newState) | ||
|
||
if shouldNotify { | ||
outOfDiskVolumes = append(outOfDiskVolumes, map[string]any{ | ||
"path": monitor.Path, | ||
"threshold": fmt.Sprintf("%d%%", monitor.Threshold), | ||
}) | ||
} | ||
|
||
//nolint:gocritic // We need to be able to update the resource monitor here. | ||
if err := a.Database.UpdateVolumeResourceMonitor(dbauthz.AsResourceMonitor(ctx), database.UpdateVolumeResourceMonitorParams{ | ||
AgentID: a.AgentID, | ||
Path: monitor.Path, | ||
State: newState, | ||
UpdatedAt: dbtime.Time(a.Clock.Now()), | ||
DebouncedUntil: dbtime.Time(debouncedUntil), | ||
}); err != nil { | ||
return xerrors.Errorf("update workspace monitor: %w", err) | ||
} | ||
} | ||
|
||
if len(outOfDiskVolumes) == 0 { | ||
return nil | ||
} | ||
|
||
workspace, err := a.Database.GetWorkspaceByID(ctx, a.WorkspaceID) | ||
if err != nil { | ||
return xerrors.Errorf("get workspace by id: %w", err) | ||
} | ||
|
||
if _, err := a.NotificationsEnqueuer.EnqueueWithData( | ||
// nolint:gocritic // We need to be able to send the notification. | ||
dbauthz.AsNotifier(ctx), | ||
workspace.OwnerID, | ||
notifications.TemplateWorkspaceOutOfDisk, | ||
map[string]string{ | ||
"workspace": workspace.Name, | ||
}, | ||
map[string]any{ | ||
"volumes": outOfDiskVolumes, | ||
// NOTE(DanielleMaywood): | ||
// When notifications are enqueued, they are checked to be | ||
// unique within a single day. This means that if we attempt | ||
// to send two OOD notifications for the same workspace on | ||
// the same day, the enqueuer will prevent us from sending | ||
// a second one. We inject a timestamp to make the | ||
// notifications appear different enough to circumvent this | ||
// deduplication logic. | ||
"timestamp": a.Clock.Now(), | ||
}, | ||
"workspace-monitor-volumes", | ||
); err != nil { | ||
return xerrors.Errorf("notify workspace OOD: %w", err) | ||
} | ||
|
||
return &proto.PushResourcesMonitoringUsageResponse{}, nil | ||
return nil | ||
} |
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.