Thanks to visit codestin.com
Credit goes to github.com

Skip to content

chore: implement oom/ood processing component #16436

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 44 commits into from
Feb 17, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
44 commits
Select commit Hold shift + click to select a range
6c6240b
chore: add workspace reached resource threshold notification
DanielleMaywood Jan 24, 2025
b3081de
chore: split out into two notifications
DanielleMaywood Jan 30, 2025
a9c8676
chore: update golden files
DanielleMaywood Jan 30, 2025
1a84f96
chore: begin impl of processing logic for oom/ood
DanielleMaywood Jan 29, 2025
78ede46
chore: appease the linter for now
DanielleMaywood Jan 29, 2025
0d2b970
chore: use latest changes to #247, start debounce logic
DanielleMaywood Jan 30, 2025
0df2fd5
chore: add more tests
DanielleMaywood Jan 30, 2025
854d81a
chore: remove mock db for workspace monitor agentapi test
DanielleMaywood Jan 30, 2025
9d9d7b4
chore: remove todo comment
DanielleMaywood Jan 30, 2025
4c21ce7
Merge branch 'main' into dm-internal-247
DanielleMaywood Feb 4, 2025
944fdb5
chore: rewrite ood notification
DanielleMaywood Feb 4, 2025
6444176
chore: update golden file
DanielleMaywood Feb 4, 2025
bc87268
chore: silly me
DanielleMaywood Feb 4, 2025
d2265f6
chore: rename test
DanielleMaywood Feb 4, 2025
82a9852
Merge branch 'dm-internal-247' into dm-internal-248
DanielleMaywood Feb 4, 2025
62621d4
chore: add more tests, fix broken sql query
DanielleMaywood Feb 4, 2025
81f43d3
Merge branch 'main' into dm-internal-248
DanielleMaywood Feb 4, 2025
7522b37
chore: update to match main
DanielleMaywood Feb 4, 2025
69c4f42
chore: run 'make gen'
DanielleMaywood Feb 4, 2025
44ebf65
chore: run 'make fmt'
DanielleMaywood Feb 5, 2025
714e743
chore: remove cruft
DanielleMaywood Feb 5, 2025
7cf5212
chore: align interface
DanielleMaywood Feb 5, 2025
d08e713
chore: add another test
DanielleMaywood Feb 6, 2025
ed42eae
chore: improve volume monitor test
DanielleMaywood Feb 6, 2025
1b0d0d2
chore: rename fields
DanielleMaywood Feb 11, 2025
51b16c6
Merge branch 'main' into dm-internal-248
DanielleMaywood Feb 11, 2025
4e43bab
chore: align with other branch
DanielleMaywood Feb 11, 2025
4e144ae
Merge branch 'main' into dm-internal-248
DanielleMaywood Feb 12, 2025
da25ecc
chore: bump migration number
DanielleMaywood Feb 12, 2025
fe1e805
chore: add test and align better
DanielleMaywood Feb 12, 2025
abbd522
chore: appease linter
DanielleMaywood Feb 12, 2025
456989e
Merge branch 'main' into dm-internal-248
DanielleMaywood Feb 14, 2025
1550cc6
chore: update rbac
DanielleMaywood Feb 14, 2025
7998f89
chore: handle missing datapoints
DanielleMaywood Feb 14, 2025
bda8f29
chore: add tests for unknown state on memory monitor
DanielleMaywood Feb 14, 2025
9d662a3
chore: add tests for missing datapoints in volume monitors
DanielleMaywood Feb 14, 2025
bff48dc
chore: add default debounce of 5 minutes
DanielleMaywood Feb 14, 2025
c343a70
chore: implement feedback
DanielleMaywood Feb 14, 2025
babc48f
chore: feedback
DanielleMaywood Feb 17, 2025
01ca549
chore: feedback
DanielleMaywood Feb 17, 2025
a975810
chore: forgot to run the linter
DanielleMaywood Feb 17, 2025
ee35d85
chore: use percentages for alert config
DanielleMaywood Feb 17, 2025
d2fa8df
Merge branch 'main' into dm-internal-248
DanielleMaywood Feb 17, 2025
27d78d1
chore: fmt and bump migration number
DanielleMaywood Feb 17, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 25 additions & 3 deletions coderd/agentapi/api.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,12 @@ import (

"cdr.dev/slog"
agentproto "github.com/coder/coder/v2/agent/proto"
"github.com/coder/coder/v2/coderd/agentapi/resourcesmonitor"
"github.com/coder/coder/v2/coderd/appearance"
"github.com/coder/coder/v2/coderd/database"
"github.com/coder/coder/v2/coderd/database/pubsub"
"github.com/coder/coder/v2/coderd/externalauth"
"github.com/coder/coder/v2/coderd/notifications"
"github.com/coder/coder/v2/coderd/prometheusmetrics"
"github.com/coder/coder/v2/coderd/tracing"
"github.com/coder/coder/v2/coderd/workspacestats"
Expand All @@ -29,6 +31,7 @@ import (
"github.com/coder/coder/v2/codersdk/agentsdk"
"github.com/coder/coder/v2/tailnet"
tailnetproto "github.com/coder/coder/v2/tailnet/proto"
"github.com/coder/quartz"
)

// API implements the DRPC agent API interface from agent/proto. This struct is
Expand Down Expand Up @@ -59,7 +62,9 @@ type Options struct {

Ctx context.Context
Log slog.Logger
Clock quartz.Clock
Database database.Store
NotificationsEnqueuer notifications.Enqueuer
Pubsub pubsub.Pubsub
DerpMapFn func() *tailcfg.DERPMap
TailnetCoordinator *atomic.Pointer[tailnet.Coordinator]
Expand All @@ -82,6 +87,10 @@ type Options struct {
}

func New(opts Options) *API {
if opts.Clock == nil {
opts.Clock = quartz.NewReal()
}

api := &API{
opts: opts,
mu: sync.Mutex{},
Expand All @@ -104,9 +113,22 @@ func New(opts Options) *API {
}

api.ResourcesMonitoringAPI = &ResourcesMonitoringAPI{
Log: opts.Log,
AgentID: opts.AgentID,
Database: opts.Database,
AgentID: opts.AgentID,
WorkspaceID: opts.WorkspaceID,
Clock: opts.Clock,
Database: opts.Database,
NotificationsEnqueuer: opts.NotificationsEnqueuer,
Debounce: 5 * time.Minute,

Config: resourcesmonitor.Config{
NumDatapoints: 20,
CollectionInterval: 10 * time.Second,

Alert: resourcesmonitor.AlertConfig{
MinimumNOKsPercent: 20,
ConsecutiveNOKsPercent: 50,
},
},
}

api.StatsAPI = &StatsAPI{
Expand Down
207 changes: 198 additions & 9 deletions coderd/agentapi/resources_monitoring.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,20 +4,35 @@ import (
"context"
"database/sql"
"errors"
"fmt"
"time"

"golang.org/x/xerrors"

"cdr.dev/slog"

"github.com/google/uuid"

"cdr.dev/slog"
"github.com/coder/coder/v2/agent/proto"
"github.com/coder/coder/v2/coderd/agentapi/resourcesmonitor"
"github.com/coder/coder/v2/coderd/database"
"github.com/coder/coder/v2/coderd/database/dbauthz"
"github.com/coder/coder/v2/coderd/database/dbtime"
"github.com/coder/coder/v2/coderd/notifications"
"github.com/coder/quartz"
)

// ResourcesMonitoringAPI implements the resources-monitoring portion of the
// DRPC agent API: it serves the monitoring configuration to agents and
// processes the usage datapoints they push back, raising OOM/OOD
// notifications when thresholds are breached.
//
// NOTE(review): the diff rendering had left duplicate declarations of
// AgentID, Database, and Log from the pre-change version of this struct;
// duplicate struct fields do not compile, so only the current fields remain.
type ResourcesMonitoringAPI struct {
	AgentID     uuid.UUID
	WorkspaceID uuid.UUID

	Log                   slog.Logger
	Clock                 quartz.Clock
	Database              database.Store
	NotificationsEnqueuer notifications.Enqueuer

	// Debounce is the minimum interval between repeated notifications
	// for the same monitor state transition.
	Debounce time.Duration
	// Config controls datapoint collection and alert thresholds.
	Config resourcesmonitor.Config
}

func (a *ResourcesMonitoringAPI) GetResourcesMonitoringConfiguration(ctx context.Context, _ *proto.GetResourcesMonitoringConfigurationRequest) (*proto.GetResourcesMonitoringConfigurationResponse, error) {
Expand All @@ -33,8 +48,8 @@ func (a *ResourcesMonitoringAPI) GetResourcesMonitoringConfiguration(ctx context

return &proto.GetResourcesMonitoringConfigurationResponse{
Config: &proto.GetResourcesMonitoringConfigurationResponse_Config{
CollectionIntervalSeconds: 10,
NumDatapoints: 20,
CollectionIntervalSeconds: int32(a.Config.CollectionInterval.Seconds()),
NumDatapoints: a.Config.NumDatapoints,
},
Memory: func() *proto.GetResourcesMonitoringConfigurationResponse_Memory {
if memoryErr != nil {
Expand All @@ -60,8 +75,182 @@ func (a *ResourcesMonitoringAPI) GetResourcesMonitoringConfiguration(ctx context
}

// PushResourcesMonitoringUsage receives a batch of usage datapoints from the
// agent and runs both the memory and volume monitors over them. Each monitor
// is evaluated even if the other fails; their errors are joined so neither
// failure masks the other.
//
// NOTE(review): the stale pre-change logging lines left over from the diff
// rendering have been removed — only the merged implementation remains.
func (a *ResourcesMonitoringAPI) PushResourcesMonitoringUsage(ctx context.Context, req *proto.PushResourcesMonitoringUsageRequest) (*proto.PushResourcesMonitoringUsageResponse, error) {
	var err error

	if memoryErr := a.monitorMemory(ctx, req.Datapoints); memoryErr != nil {
		err = errors.Join(err, xerrors.Errorf("monitor memory: %w", memoryErr))
	}

	if volumeErr := a.monitorVolumes(ctx, req.Datapoints); volumeErr != nil {
		err = errors.Join(err, xerrors.Errorf("monitor volume: %w", volumeErr))
	}

	return &proto.PushResourcesMonitoringUsageResponse{}, err
}

// monitorMemory evaluates the pushed memory datapoints against the agent's
// memory resource monitor (if one exists and is enabled), persists the
// resulting state transition, and enqueues an out-of-memory notification
// when the debounce logic determines one is due.
func (a *ResourcesMonitoringAPI) monitorMemory(ctx context.Context, datapoints []*proto.PushResourcesMonitoringUsageRequest_Datapoint) error {
	memoryMonitor, err := a.Database.FetchMemoryResourceMonitorsByAgentID(ctx, a.AgentID)
	switch {
	case errors.Is(err, sql.ErrNoRows):
		// An agent without a memory monitor is a valid configuration,
		// not an error.
		return nil
	case err != nil:
		return xerrors.Errorf("fetch memory resource monitor: %w", err)
	}

	if !memoryMonitor.Enabled {
		return nil
	}

	memoryUsage := make([]*proto.PushResourcesMonitoringUsageRequest_Datapoint_MemoryUsage, 0, len(datapoints))
	for _, dp := range datapoints {
		memoryUsage = append(memoryUsage, dp.Memory)
	}

	states := resourcesmonitor.CalculateMemoryUsageStates(memoryMonitor, memoryUsage)

	prevState := memoryMonitor.State
	nextState := resourcesmonitor.NextState(a.Config, prevState, states)

	debouncedUntil, shouldNotify := memoryMonitor.Debounce(a.Debounce, a.Clock.Now(), prevState, nextState)

	// The monitor row is always updated, even when no notification is
	// sent, so the persisted state tracks the latest evaluation.
	//nolint:gocritic // We need to be able to update the resource monitor here.
	if err := a.Database.UpdateMemoryResourceMonitor(dbauthz.AsResourceMonitor(ctx), database.UpdateMemoryResourceMonitorParams{
		AgentID:        a.AgentID,
		State:          nextState,
		UpdatedAt:      dbtime.Time(a.Clock.Now()),
		DebouncedUntil: dbtime.Time(debouncedUntil),
	}); err != nil {
		return xerrors.Errorf("update workspace monitor: %w", err)
	}

	if !shouldNotify {
		return nil
	}

	workspace, err := a.Database.GetWorkspaceByID(ctx, a.WorkspaceID)
	if err != nil {
		return xerrors.Errorf("get workspace by id: %w", err)
	}

	if _, err := a.NotificationsEnqueuer.EnqueueWithData(
		// nolint:gocritic // We need to be able to send the notification.
		dbauthz.AsNotifier(ctx),
		workspace.OwnerID,
		notifications.TemplateWorkspaceOutOfMemory,
		map[string]string{
			"workspace": workspace.Name,
			"threshold": fmt.Sprintf("%d%%", memoryMonitor.Threshold),
		},
		map[string]any{
			// NOTE(DanielleMaywood):
			// Enqueued notifications are deduplicated within a single
			// day, so two OOM notifications for the same workspace on
			// the same day would normally collapse into one. We inject
			// a timestamp to make each notification look distinct
			// enough to circumvent that deduplication logic.
			"timestamp": a.Clock.Now(),
		},
		"workspace-monitor-memory",
	); err != nil {
		return xerrors.Errorf("notify workspace OOM: %w", err)
	}

	return nil
}

func (a *ResourcesMonitoringAPI) monitorVolumes(ctx context.Context, datapoints []*proto.PushResourcesMonitoringUsageRequest_Datapoint) error {
volumeMonitors, err := a.Database.FetchVolumesResourceMonitorsByAgentID(ctx, a.AgentID)
if err != nil {
return xerrors.Errorf("get or insert volume monitor: %w", err)
}

outOfDiskVolumes := make([]map[string]any, 0)

for _, monitor := range volumeMonitors {
if !monitor.Enabled {
continue
}

usageDatapoints := make([]*proto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage, 0, len(datapoints))
for _, datapoint := range datapoints {
var usage *proto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage

for _, volume := range datapoint.Volumes {
if volume.Volume == monitor.Path {
usage = volume
break
}
}

usageDatapoints = append(usageDatapoints, usage)
}

usageStates := resourcesmonitor.CalculateVolumeUsageStates(monitor, usageDatapoints)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

On second thought, given that we're having to do this for more memory & volumes, I think we should seriously consider updating the agent to send back that bool to indicate enabled but failed to collect; that's a 1:1 with your "unknown" logic here.

This can be in a follow-up.


oldState := monitor.State
newState := resourcesmonitor.NextState(a.Config, oldState, usageStates)

debouncedUntil, shouldNotify := monitor.Debounce(a.Debounce, a.Clock.Now(), oldState, newState)

if shouldNotify {
outOfDiskVolumes = append(outOfDiskVolumes, map[string]any{
"path": monitor.Path,
"threshold": fmt.Sprintf("%d%%", monitor.Threshold),
})
}

//nolint:gocritic // We need to be able to update the resource monitor here.
if err := a.Database.UpdateVolumeResourceMonitor(dbauthz.AsResourceMonitor(ctx), database.UpdateVolumeResourceMonitorParams{
AgentID: a.AgentID,
Path: monitor.Path,
State: newState,
UpdatedAt: dbtime.Time(a.Clock.Now()),
DebouncedUntil: dbtime.Time(debouncedUntil),
}); err != nil {
return xerrors.Errorf("update workspace monitor: %w", err)
}
}

if len(outOfDiskVolumes) == 0 {
return nil
}

workspace, err := a.Database.GetWorkspaceByID(ctx, a.WorkspaceID)
if err != nil {
return xerrors.Errorf("get workspace by id: %w", err)
}

if _, err := a.NotificationsEnqueuer.EnqueueWithData(
// nolint:gocritic // We need to be able to send the notification.
dbauthz.AsNotifier(ctx),
workspace.OwnerID,
notifications.TemplateWorkspaceOutOfDisk,
map[string]string{
"workspace": workspace.Name,
},
map[string]any{
"volumes": outOfDiskVolumes,
// NOTE(DanielleMaywood):
// When notifications are enqueued, they are checked to be
// unique within a single day. This means that if we attempt
// to send two OOM notifications for the same workspace on
// the same day, the enqueuer will prevent us from sending
// a second one. We are inject a timestamp to make the
// notifications appear different enough to circumvent this
// deduplication logic.
"timestamp": a.Clock.Now(),
},
"workspace-monitor-volumes",
); err != nil {
return xerrors.Errorf("notify workspace OOD: %w", err)
}

return &proto.PushResourcesMonitoringUsageResponse{}, nil
return nil
}
Loading
Loading