Thanks to visit codestin.com
Credit goes to github.com

Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
3abc597
Create alert metrics db migration
spencerpauly Feb 2, 2022
6b8c132
remove closed_at, make tta nullable
dctalbot Feb 7, 2022
91b8e95
CREATE INDEX idx_closed_events
dctalbot Feb 7, 2022
e5fc386
new engine_processing_type
dctalbot Feb 7, 2022
70cbb25
new engine scaffold
dctalbot Feb 7, 2022
07cae19
add state column to engine_processing_versions
dctalbot Feb 7, 2022
e2c7173
update migration
KatieMSB Feb 7, 2022
92319f2
get max alertID and current state
KatieMSB Feb 7, 2022
4c26d02
use hard coded engine version
dctalbot Feb 8, 2022
8abe45e
set max alert ID if not set
dctalbot Feb 8, 2022
9fdfb49
find alerts for metrics
KatieMSB Feb 8, 2022
4969bd0
findAlerts -> insertAlertMetrics
dctalbot Feb 8, 2022
77103c1
findAlerts -> insertAlertMetrics
dctalbot Feb 8, 2022
a4ec308
fix escalated select
dctalbot Feb 14, 2022
76babf4
introduce findRecentAlert
dctalbot Feb 14, 2022
8852930
handle updating state
dctalbot Feb 14, 2022
1a50a6d
reset sliding window upon reaching min closed alert ID
dctalbot Feb 14, 2022
26733b7
handle null exception
dctalbot Feb 14, 2022
0673ff4
reorganize new migrations
mastercactapus Feb 14, 2022
3ec5b44
refactor
mastercactapus Feb 14, 2022
a782ca6
fix type error
mastercactapus Feb 14, 2022
e4dd98f
Merge branch 'master' into alert-metrics-new-engine
dctalbot Feb 23, 2022
2c2c795
bug fix for querying recently closed alerts
dctalbot Feb 23, 2022
d294cbd
remove fk constraint on service_id
dctalbot Feb 23, 2022
b23b773
add non-null service_id check to insertMetrics
dctalbot Feb 23, 2022
1a401d8
wip smoketest
dctalbot Feb 24, 2022
9d15153
fix test
dctalbot Feb 24, 2022
67c8a28
add id col
dctalbot Feb 24, 2022
e2dde7d
rm test (defer until graphql layer is implemented)
dctalbot Feb 24, 2022
fb3cc91
Merge branch 'master' into alert-metrics-new-engine
dctalbot Feb 24, 2022
1c6c842
fix migration order
mastercactapus Mar 7, 2022
f82bd84
Merge branch 'master' into alert-metrics-new-engine
dctalbot Mar 7, 2022
2ced023
fix empty alerts table edge case
dctalbot Mar 7, 2022
e01fa7b
use int type for state value
dctalbot Mar 7, 2022
59c925c
add null check
dctalbot Mar 7, 2022
b58a84d
short-circuit before loading state
dctalbot Mar 7, 2022
7b4e755
add drop index stmt
dctalbot Mar 7, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions engine/engine.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (
"github.com/target/goalert/engine/escalationmanager"
"github.com/target/goalert/engine/heartbeatmanager"
"github.com/target/goalert/engine/message"
"github.com/target/goalert/engine/metricsmanager"
"github.com/target/goalert/engine/npcyclemanager"
"github.com/target/goalert/engine/processinglock"
"github.com/target/goalert/engine/rotationmanager"
Expand Down Expand Up @@ -123,6 +124,10 @@ func NewEngine(ctx context.Context, db *sql.DB, c *Config) (*Engine, error) {
if err != nil {
return nil, errors.Wrap(err, "cleanup backend")
}
metricsMgr, err := metricsmanager.NewDB(ctx, db)
if err != nil {
return nil, errors.Wrap(err, "metrics management backend")
}

p.modules = []updater{
rotMgr,
Expand All @@ -133,6 +138,7 @@ func NewEngine(ctx context.Context, db *sql.DB, c *Config) (*Engine, error) {
verifyMgr,
hbMgr,
cleanMgr,
metricsMgr,
}

p.msg, err = message.NewDB(ctx, db, c.AlertLogStore, p.mgr)
Expand Down
75 changes: 75 additions & 0 deletions engine/metricsmanager/db.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
package metricsmanager

import (
"context"
"database/sql"

"github.com/target/goalert/engine/processinglock"
"github.com/target/goalert/util"
)

// engineVersion is the processing-lock version claimed by this module.
// Bump it when the metrics processing logic changes incompatibly.
const engineVersion = 1

// DB handles updating metrics: it computes and stores per-alert metric
// rows (time-to-ack, time-to-close, escalated) for closed alerts.
type DB struct {
	db   *sql.DB
	lock *processinglock.Lock // processing lock for the metrics engine type

	highAlertID *sql.Stmt // max id among closed alerts
	lowAlertID  *sql.Stmt // min id among closed alerts

	recentlyClosed *sql.Stmt // alerts closed in the last hour with no metrics row yet
	scanAlerts     *sql.Stmt // closed alerts in an id range with no metrics row yet
	insertMetrics  *sql.Stmt // insert metric rows for a set of alert ids
}

// Name returns the name of the module.
func (db *DB) Name() string {
	return "Engine.MetricsManager"
}

// NewDB creates a new DB, acquiring the metrics processing lock and
// preparing all statements used by UpdateAll.
func NewDB(ctx context.Context, db *sql.DB) (*DB, error) {
	lock, err := processinglock.NewLock(ctx, db, processinglock.Config{
		Version: engineVersion,
		Type:    processinglock.TypeMetrics,
	})
	if err != nil {
		return nil, err
	}

	p := &util.Prepare{Ctx: ctx, DB: db}

	return &DB{
		db:   db,
		lock: lock,

		// Bounds of the closed-alert id range; used by UpdateAll to seed
		// and reset its backwards scan window.
		highAlertID: p.P(`select max(id) from alerts where status = 'closed'`),
		lowAlertID:  p.P(`select min(id) from alerts where status = 'closed'`),

		// Alerts with a 'closed' log event in the last hour that have no
		// metrics row yet ("m isnull" is the whole-row NULL test for a
		// left join that found no match). Capped at 500 per pass.
		recentlyClosed: p.P(`
			select distinct log.alert_id
			from alert_logs log
			left join alert_metrics m on m.alert_id = log.alert_id
			where m isnull and log.event = 'closed' and log.timestamp >= now() - '1 hour'::interval
			limit 500
		`),

		// Closed alerts in [$1, $2] (inclusive) that have no metrics row yet.
		scanAlerts: p.P(`
			select a.id
			from alerts a
			left join alert_metrics m on m.alert_id = a.id
			where m isnull and a.status = 'closed' and a.id between $1 and $2
		`),

		// Computes and inserts one metrics row per alert id in $1.
		// Alerts with a NULL service_id are skipped (the metrics table has
		// no FK on service_id).
		//
		// NOTE(review): `count(*) > 1` means an alert only counts as
		// "escalated" after two or more 'escalated' events — confirm this
		// is intended rather than `> 0`.
		insertMetrics: p.P(`
			insert into alert_metrics (alert_id, service_id, time_to_ack, time_to_close, escalated)
			select
				a.id,
				a.service_id,
				(select timestamp - a.created_at from alert_logs where alert_id = a.id and event = 'acknowledged' order by timestamp limit 1),
				(select timestamp - a.created_at from alert_logs where alert_id = a.id and event = 'closed' order by timestamp limit 1),
				(select count(*) > 1 from alert_logs where alert_id = a.id and event = 'escalated')
			from alerts a
			where a.id = any($1) and a.service_id is not null
		`),
	}, p.Err
}
139 changes: 139 additions & 0 deletions engine/metricsmanager/update.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
package metricsmanager

import (
"context"
"database/sql"
"fmt"

"github.com/target/goalert/permission"
"github.com/target/goalert/util/log"
"github.com/target/goalert/util/sqlutil"
)

// State is the JSON-serialized scan position persisted in the
// processing-lock state between UpdateAll runs.
type State struct {
	V1 struct {
		// NextAlertID is the inclusive upper bound of the next batch of
		// alert ids to scan; 0 means no scan has been started yet.
		NextAlertID int
	}
}

/*
	Theory of Operation:

	1. Acquire the processing lock.
	2. Look for recently closed alerts without a metrics entry.
	3. If any exist, insert metrics for them and exit.
	4. If no saved state, start a backwards scan from the highest closed alert id.
	5. If state exists, resume the scan until the minimum closed alert id is reached.

*/

// UpdateAll will update the alert metrics table. Recently closed alerts
// are handled first; otherwise it advances a bounded (≤500 id) backwards
// scan over historical closed alerts, persisting its position in the
// processing-lock state.
func (db *DB) UpdateAll(ctx context.Context) error {
	err := permission.LimitCheckAny(ctx, permission.System)
	if err != nil {
		return err
	}
	log.Debugf(ctx, "Running metrics operations.")

	tx, lockState, err := db.lock.BeginTxWithState(ctx, nil)
	if err != nil {
		return fmt.Errorf("begin tx: %w", err)
	}
	defer tx.Rollback()

	rows, err := tx.StmtContext(ctx, db.recentlyClosed).QueryContext(ctx)
	if err != nil {
		return fmt.Errorf("query recently closed alerts: %w", err)
	}
	alertIDs, err := scanAlertIDs(rows)
	if err != nil {
		return fmt.Errorf("scan recently closed alerts: %w", err)
	}

	if len(alertIDs) > 0 {
		// Fast path: insert metrics for recently closed alerts and exit.
		_, err = tx.StmtContext(ctx, db.insertMetrics).ExecContext(ctx, sqlutil.IntArray(alertIDs))
		if err != nil {
			return fmt.Errorf("insert metrics: %w", err)
		}
		err = tx.Commit()
		if err != nil {
			return fmt.Errorf("commit: %w", err)
		}
		return nil
	}

	// fetch min closed-alert id from db for later
	var minAlertID sql.NullInt64
	err = tx.StmtContext(ctx, db.lowAlertID).QueryRowContext(ctx).Scan(&minAlertID)
	if err != nil {
		return fmt.Errorf("query min alert id: %w", err)
	}

	if !minAlertID.Valid {
		// no closed alerts at all; nothing to do
		return nil
	}

	var state State
	err = lockState.Load(ctx, &state)
	if err != nil {
		return fmt.Errorf("load state: %w", err)
	}

	if state.V1.NextAlertID == 0 || state.V1.NextAlertID < int(minAlertID.Int64) {
		// No saved state, or the scan has passed the lowest closed alert:
		// (re)start from the highest closed alert id.
		err = tx.StmtContext(ctx, db.highAlertID).QueryRowContext(ctx).Scan(&state.V1.NextAlertID)
		if err != nil {
			return fmt.Errorf("query high alert id: %w", err)
		}
	}

	// clamp the lower bound to at most 500 below the scan position so
	// each pass processes a bounded batch
	if int(minAlertID.Int64) < state.V1.NextAlertID-500 {
		minAlertID.Int64 = int64(state.V1.NextAlertID) - 500
	}

	// fetch alerts to update within [minAlertID, NextAlertID]
	rows, err = tx.StmtContext(ctx, db.scanAlerts).QueryContext(ctx, minAlertID, state.V1.NextAlertID)
	if err != nil {
		return fmt.Errorf("query alerts: %w", err)
	}
	alertIDs, err = scanAlertIDs(rows)
	if err != nil {
		return fmt.Errorf("scan alerts: %w", err)
	}

	if len(alertIDs) > 0 {
		_, err = tx.StmtContext(ctx, db.insertMetrics).ExecContext(ctx, sqlutil.IntArray(alertIDs))
		if err != nil {
			return fmt.Errorf("insert metrics: %w", err)
		}
	}

	// advance the scan window below this batch and persist the position
	state.V1.NextAlertID = int(minAlertID.Int64) - 1
	err = lockState.Save(ctx, &state)
	if err != nil {
		return fmt.Errorf("save state: %w", err)
	}

	err = tx.Commit()
	if err != nil {
		return fmt.Errorf("commit: %w", err)
	}

	return nil
}

// scanAlertIDs drains rows of single-int results, always closing them and
// reporting any iteration error via rows.Err().
func scanAlertIDs(rows *sql.Rows) ([]int, error) {
	defer rows.Close()

	var ids []int
	for rows.Next() {
		var id int
		if err := rows.Scan(&id); err != nil {
			return nil, fmt.Errorf("scan alert id: %w", err)
		}
		ids = append(ids, id)
	}
	return ids, rows.Err()
}
5 changes: 5 additions & 0 deletions engine/processinglock/lock.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@ type Lock struct {
db *sql.DB
lockStmt *sql.Stmt

loadState *sql.Stmt
saveState *sql.Stmt

advLockStmt *sql.Stmt
}
type txBeginner interface {
Expand All @@ -36,6 +39,8 @@ func NewLock(ctx context.Context, db *sql.DB, cfg Config) (*Lock, error) {
where type_id = $1
for update nowait
`),
loadState: p.P(`select state from engine_processing_versions where type_id = $1 for update nowait`),
saveState: p.P(`update engine_processing_versions set state = $2 where type_id = $1`),
}, p.Err
}

Expand Down
53 changes: 53 additions & 0 deletions engine/processinglock/state.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
package processinglock

import (
"context"
"database/sql"
"encoding/json"

"github.com/target/goalert/util/jsonutil"
)

// State manages the state value for a processing lock.
type State struct {
	data []byte  // raw JSON state as last read from the database
	tx   *sql.Tx // transaction the state row is locked under
	l    *Lock   // parent lock (prepared statements and type config)
}

// BeginTxWithState will start a transaction, returning a State object.
func (l *Lock) BeginTxWithState(ctx context.Context, opts *sql.TxOptions) (*sql.Tx, *State, error) {
	tx, err := l.BeginTx(ctx, opts)
	if err != nil {
		return nil, nil, err
	}

	st := &State{l: l, tx: tx}
	return tx, st, nil
}

// Load will load the JSON state from the database into v, locking the
// state row for update for the remainder of the transaction.
//
// If no state has ever been saved (empty or NULL column), v is left at
// its zero value rather than failing to parse empty JSON.
func (s *State) Load(ctx context.Context, v interface{}) error {
	err := s.tx.StmtContext(ctx, s.l.loadState).QueryRowContext(ctx, s.l.cfg.Type).Scan(&s.data)
	if err != nil {
		return err
	}

	// json.Unmarshal rejects empty input; treat a missing/NULL state as
	// the zero value of v.
	if len(s.data) == 0 {
		return nil
	}

	return json.Unmarshal(s.data, v)
}

// Save will save the JSON state to the database, taking care to ensure that
// existing unknown fields are preserved.
func (s *State) Save(ctx context.Context, v interface{}) error {
	merged, err := jsonutil.Apply(s.data, v)
	if err != nil {
		return err
	}
	s.data = merged

	if _, err = s.tx.StmtContext(ctx, s.l.saveState).ExecContext(ctx, s.l.cfg.Type, s.data); err != nil {
		return err
	}

	return nil
}
65 changes: 1 addition & 64 deletions engine/processinglock/type.go
Original file line number Diff line number Diff line change
@@ -1,11 +1,5 @@
package processinglock

import (
"database/sql/driver"
"fmt"
"github.com/target/goalert/validation/validate"
)

// Type indicates the lock type. For TypeMessage, the RegionID is used.
type Type string

Expand All @@ -20,62 +14,5 @@ const (
TypeVerify Type = "verify"
TypeMessage Type = "message"
TypeCleanup Type = "cleanup"
TypeMetrics Type = "metrics"
)

func (t Type) validate() error {
return validate.OneOf("Type", t,
TypeEscalation,
TypeHeartbeat,
TypeNPCycle,
TypeRotation,
TypeSchedule,
TypeStatusUpdate,
TypeVerify,
TypeMessage,
TypeCleanup,
)
}

// Value will return the DB enum value of the Type.
func (t Type) Value() (driver.Value, error) {
return string(t), t.validate()
}

// Scan will scan a DB enum value into Type.
func (t *Type) Scan(value interface{}) error {
switch _t := value.(type) {
case []byte:
*t = Type(_t)
case string:
*t = Type(_t)
default:
return fmt.Errorf("could not process unknown type for Type(%T)", t)
}
return t.validate()
}

// LockID returns the int value used for the advisory lock for the Type.
func (t Type) LockID() int {
switch t {
case TypeEscalation:
return 0x1000 // 4096
case TypeHeartbeat:
return 0x1010 // 4112
case TypeNPCycle:
return 0x1020 // 4128
case TypeRotation:
return 0x1030 // 4144
case TypeSchedule:
return 0x1040 // 4160
case TypeStatusUpdate:
return 0x1050 // 4176
case TypeVerify:
return 0x1060 // 4192
case TypeMessage:
return 0x1070 // 4208
case TypeCleanup:
return 0x1080 // 4224
}

panic("invalid type")
}
4 changes: 4 additions & 0 deletions migrate/migrations/20220307103153-add-metrics-proctype.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
-- +migrate Up notransaction
-- "notransaction": ALTER TYPE ... ADD VALUE cannot run inside a transaction
-- block on PostgreSQL versions before 12, so this migration must run outside
-- one. IF NOT EXISTS keeps the statement idempotent on re-run.
ALTER TYPE engine_processing_type ADD VALUE IF NOT EXISTS 'metrics';

-- +migrate Down
-- No down migration: PostgreSQL provides no way to remove a value from an
-- existing enum type.
Loading