From 0e27f0ff6ce595031479e6b9c12bbbe8fc434ffe Mon Sep 17 00:00:00 2001 From: Iain Lane Date: Wed, 16 Mar 2022 17:14:49 +0000 Subject: [PATCH 1/4] Add metrics for tracking live servers This should allow us to correlate the servers that drone thinks it knows about with those that GCP has --- cmd/drone-autoscaler/main.go | 6 +++-- engine/alloc.go | 2 ++ metrics/metrics.go | 44 ++++++++++++++++++++++++++++++++++++ metrics/server_delete.go | 15 +++++++----- 4 files changed, 59 insertions(+), 8 deletions(-) diff --git a/cmd/drone-autoscaler/main.go b/cmd/drone-autoscaler/main.go index fad0bff1..9770c0bb 100644 --- a/cmd/drone-autoscaler/main.go +++ b/cmd/drone-autoscaler/main.go @@ -62,9 +62,11 @@ func main() { Fatalln("Invalid or missing hosting provider") } + collector := metrics.New() + // instruments the provider with prometheus metrics. provider = metrics.ServerCreate(provider) - provider = metrics.ServerDelete(provider) + provider = metrics.ServerDelete(provider, collector) db, err := store.Connect( conf.Database.Driver, @@ -95,7 +97,7 @@ func main() { conf, servers, provider, - metrics.New(), + collector, ) // diff --git a/engine/alloc.go b/engine/alloc.go index 22dff6db..97bc4838 100644 --- a/engine/alloc.go +++ b/engine/alloc.go @@ -123,5 +123,7 @@ func (a *allocator) allocate(ctx context.Context, server *autoscaler.Server) err return err } + a.metrics.RegisterKnownInstance(instance) + return nil } diff --git a/metrics/metrics.go b/metrics/metrics.go index 08bcfe37..c69d12dd 100644 --- a/metrics/metrics.go +++ b/metrics/metrics.go @@ -9,6 +9,8 @@ import ( "time" "github.com/prometheus/client_golang/prometheus" + + "github.com/drone/autoscaler" ) var noContext = context.Background() @@ -40,6 +42,10 @@ type Collector interface { // IncrServerSetupError keeps a count of errors encountered // when installing software on servers. IncrServerSetupError() + + RegisterKnownInstance(instance *autoscaler.Instance) + + UnregisterKnownInstance(instance *autoscaler.Instance) } // Prometheus is a Prometheus metrics collector. @@ -50,6 +56,7 @@ type Prometheus struct { countServerCreateErr prometheus.Counter countServerInitErr prometheus.Counter countServerSetupErr prometheus.Counter + knownInstance *prometheus.GaugeVec } // New returns a new Prometheus metrics provider. @@ -82,12 +89,23 @@ func New() *Prometheus { Name: "drone_server_install_errors_total", Help: "Total number of errors installing software on a server.", }) + p.knownInstance = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "drone_server_known_instance", + Help: "Known server instances.", + }, + []string{ + "name", + "provider", + "region", + "size", + }) prometheus.MustRegister(p.trackServerCreateTime) prometheus.MustRegister(p.trackServerInitTime) prometheus.MustRegister(p.trackServerSetupTime) prometheus.MustRegister(p.countServerCreateErr) prometheus.MustRegister(p.countServerInitErr) prometheus.MustRegister(p.countServerSetupErr) + prometheus.MustRegister(p.knownInstance) return p } @@ -135,6 +153,26 @@ func (m *Prometheus) IncrServerSetupError() { m.countServerSetupErr.Inc() } +// RegisterKnownInstance registers that we know about a server. +func (m *Prometheus) RegisterKnownInstance(instance *autoscaler.Instance) { + m.knownInstance.With(prometheus.Labels{ + "name": instance.Name, + "provider": string(instance.Provider), + "region": instance.Region, + "size": instance.Size, + }).Set(1) +} + +// UnregisterKnownInstance forgets a server we once knew. +func (m *Prometheus) UnregisterKnownInstance(instance *autoscaler.Instance) { + m.knownInstance.Delete(prometheus.Labels{ + "name": instance.Name, + "provider": string(instance.Provider), + "region": instance.Region, + "size": instance.Size, + }) +} + // NopCollector provides a no-op metrics collector. type NopCollector struct{} @@ -163,3 +201,9 @@ func (*NopCollector) IncrServerInitError() {} // IncrServerSetupError keeps a count of errors encountered // when installing software on servers. func (*NopCollector) IncrServerSetupError() {} + +// RegisterKnownInstance registers that we know about a server. +func (*NopCollector) RegisterKnownInstance(instance *autoscaler.Instance) {} + +// UnregisterKnownInstance forgets a server we once knew. +func (*NopCollector) UnregisterKnownInstance(instance *autoscaler.Instance) {} diff --git a/metrics/server_delete.go b/metrics/server_delete.go index 3d60bb33..ccab6d17 100644 --- a/metrics/server_delete.go +++ b/metrics/server_delete.go @@ -12,7 +12,7 @@ import ( ) // ServerDelete provides metrics for servers deleted. -func ServerDelete(provider autoscaler.Provider) autoscaler.Provider { +func ServerDelete(provider autoscaler.Provider, collector Collector) autoscaler.Provider { created := prometheus.NewCounter(prometheus.CounterOpts{ Name: "drone_servers_deleted", Help: "Total number of servers deleted.", @@ -24,17 +24,19 @@ func ServerDelete(provider autoscaler.Provider) autoscaler.Provider { prometheus.MustRegister(created) prometheus.MustRegister(errors) return &providerWrapDestroy{ - Provider: provider, - created: created, - errors: errors, + Provider: provider, + collector: collector, + created: created, + errors: errors, } } // instruments the Provider to count server destroy events. type providerWrapDestroy struct { autoscaler.Provider - created prometheus.Counter - errors prometheus.Counter + collector Collector + created prometheus.Counter + errors prometheus.Counter } func (p *providerWrapDestroy) Destroy(ctx context.Context, instance *autoscaler.Instance) error { @@ -44,5 +46,6 @@ func (p *providerWrapDestroy) Destroy(ctx context.Context, instance *autoscaler. } else { p.errors.Add(1) } + p.collector.UnregisterKnownInstance(instance) return err } From b5f8ab0784db4a06a987521066bcd60fd3c7c101 Mon Sep 17 00:00:00 2001 From: Iain Lane Date: Tue, 29 Mar 2022 18:21:56 +0100 Subject: [PATCH 2/4] Fix server delete tests Now we pass a collector to `ServerDelete()`, more metrics are in the registry and the ones we want are at the end. --- metrics/server_delete_test.go | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/metrics/server_delete_test.go b/metrics/server_delete_test.go index 8aee546e..c0e02c00 100644 --- a/metrics/server_delete_test.go +++ b/metrics/server_delete_test.go @@ -35,7 +35,8 @@ func TestServerDelete(t *testing.T) { provider.EXPECT().Destroy(noContext, instance).Times(3).Return(nil) provider.EXPECT().Destroy(noContext, instance).Return(errors.New("error")) - providerInst := ServerDelete(provider) + collector := New() + providerInst := ServerDelete(provider, collector) for i := 0; i < 3; i++ { err := providerInst.Destroy(noContext, instance) if err != nil { @@ -52,20 +53,20 @@ func TestServerDelete(t *testing.T) { t.Error(err) return } - if want, got := len(metrics), 2; want != got { - t.Errorf("Expect registered metric") + if want, got := len(metrics), 8; want != got { + t.Errorf("Expect registered metric %d, got %d", want, got) return } - if got, want := metrics[0].GetName(), "drone_servers_deleted"; want != got { + if got, want := metrics[6].GetName(), "drone_servers_deleted"; want != got { t.Errorf("Expect metric name %s, got %s", want, got) } - if got, want := metrics[0].Metric[0].Counter.GetValue(), float64(3); want != got { + if got, want := metrics[6].Metric[0].Counter.GetValue(), float64(3); want != got { t.Errorf("Expect metric value %f, got %f", want, got) } - if got, want := metrics[1].GetName(), "drone_servers_deleted_err"; want != got { + if got, want := metrics[7].GetName(), "drone_servers_deleted_err"; want != got { t.Errorf("Expect metric name %s, got %s", want, got) } - if got, want := metrics[1].Metric[0].Counter.GetValue(), float64(1); want != got { + if got, want := metrics[7].Metric[0].Counter.GetValue(), float64(1); want != got { t.Errorf("Expect metric value %f, got %f", want, got) } } From 8ee38141e5726e1e18dc4846b3e7e164b0d88b6c Mon Sep 17 00:00:00 2001 From: Iain Lane Date: Wed, 20 Jul 2022 18:02:29 +0100 Subject: [PATCH 3/4] Add feature flag DRONE_AUTOSCALER_REGISTER_KNOWN_SERVERS Only expose the new metrics when this variable is set --- metrics/metrics.go | 44 +++++++++++++++++++++++++++++++------------- 1 file changed, 31 insertions(+), 13 deletions(-) diff --git a/metrics/metrics.go b/metrics/metrics.go index c69d12dd..f92c96c8 100644 --- a/metrics/metrics.go +++ b/metrics/metrics.go @@ -6,6 +6,8 @@ package metrics import ( "context" + "os" + "strconv" "time" "github.com/prometheus/client_golang/prometheus" @@ -15,6 +17,16 @@ import ( var noContext = context.Background() +// this is a feature flag that can be used to enable +// metrics to track registering/unregistering of servers +var registerKnownServers = false + +func init() { + registerKnownServers, _ = strconv.ParseBool( + os.Getenv("DRONE_AUTOSCALER_REGISTER_KNOWN_SERVERS"), + ) +} + // Collector defines a metrics collector. type Collector interface { // TrackServerCreateTime registers the elapsed time it takes @@ -105,7 +117,9 @@ func New() *Prometheus { prometheus.MustRegister(p.countServerCreateErr) prometheus.MustRegister(p.countServerInitErr) prometheus.MustRegister(p.countServerSetupErr) - prometheus.MustRegister(p.knownInstance) + if registerKnownServers { + prometheus.MustRegister(p.knownInstance) + } return p } @@ -155,22 +169,26 @@ func (m *Prometheus) IncrServerSetupError() { // RegisterKnownInstance registers that we know about a server. func (m *Prometheus) RegisterKnownInstance(instance *autoscaler.Instance) { - m.knownInstance.With(prometheus.Labels{ - "name": instance.Name, - "provider": string(instance.Provider), - "region": instance.Region, - "size": instance.Size, - }).Set(1) + if registerKnownServers { + m.knownInstance.With(prometheus.Labels{ + "name": instance.Name, + "provider": string(instance.Provider), + "region": instance.Region, + "size": instance.Size, + }).Set(1) + } } // UnregisterKnownInstance forgets a server we once knew. func (m *Prometheus) UnregisterKnownInstance(instance *autoscaler.Instance) { - m.knownInstance.Delete(prometheus.Labels{ - "name": instance.Name, - "provider": string(instance.Provider), - "region": instance.Region, - "size": instance.Size, - }) + if registerKnownServers { + m.knownInstance.Delete(prometheus.Labels{ + "name": instance.Name, + "provider": string(instance.Provider), + "region": instance.Region, + "size": instance.Size, + }) + } } // NopCollector provides a no-op metrics collector. From 68d4590f0486a721c873feffeb600dca5fe44fd1 Mon Sep 17 00:00:00 2001 From: Iain Lane Date: Tue, 2 Aug 2022 16:27:39 +0100 Subject: [PATCH 4/4] Review feedback Rename to DRONE_METRICS_REGISTER_KNOWN_SERVERS, expose via config struct, reorder imports --- cmd/drone-autoscaler/main.go | 2 +- config/config.go | 4 ++++ metrics/metrics.go | 30 ++++++++++++------------------ metrics/server_delete_test.go | 3 ++- 4 files changed, 19 insertions(+), 20 deletions(-) diff --git a/cmd/drone-autoscaler/main.go b/cmd/drone-autoscaler/main.go index 9770c0bb..f6116a49 100644 --- a/cmd/drone-autoscaler/main.go +++ b/cmd/drone-autoscaler/main.go @@ -62,7 +62,7 @@ func main() { Fatalln("Invalid or missing hosting provider") } - collector := metrics.New() + collector := metrics.New(conf) // instruments the provider with prometheus metrics. provider = metrics.ServerCreate(provider) diff --git a/config/config.go b/config/config.go index bdd34bcb..ab81f981 100644 --- a/config/config.go +++ b/config/config.go @@ -99,6 +99,10 @@ type ( Root string `envconfig:"DRONE_HTTP_ROOT" default:"/"` } + Metrics struct { + RegisterKnownServers bool `envconfig:"DRONE_METRICS_REGISTER_KNOWN_SERVERS" default:"false"` + } + UI struct { Username string `envconfig:"DRONE_UI_USERNAME"` Password string `envconfig:"DRONE_UI_PASSWORD"` diff --git a/metrics/metrics.go b/metrics/metrics.go index f92c96c8..ac70467b 100644 --- a/metrics/metrics.go +++ b/metrics/metrics.go @@ -6,27 +6,16 @@ package metrics import ( "context" - "os" - "strconv" "time" - "github.com/prometheus/client_golang/prometheus" - "github.com/drone/autoscaler" + "github.com/drone/autoscaler/config" + + "github.com/prometheus/client_golang/prometheus" ) var noContext = context.Background() -// this is a feature flag that can be used to enable -// metrics to track registering/unregistering of servers -var registerKnownServers = false - -func init() { - registerKnownServers, _ = strconv.ParseBool( - os.Getenv("DRONE_AUTOSCALER_REGISTER_KNOWN_SERVERS"), - ) -} - // Collector defines a metrics collector. type Collector interface { // TrackServerCreateTime registers the elapsed time it takes @@ -69,11 +58,16 @@ type Prometheus struct { countServerInitErr prometheus.Counter countServerSetupErr prometheus.Counter knownInstance *prometheus.GaugeVec + + registerKnownServers bool } // New returns a new Prometheus metrics provider. -func New() *Prometheus { +func New(c config.Config) *Prometheus { p := new(Prometheus) + + p.registerKnownServers = c.Metrics.RegisterKnownServers + p.trackServerCreateTime = prometheus.NewHistogram(prometheus.HistogramOpts{ Name: "drone_server_create_time_seconds", Help: "Elapsed time creating a server.", @@ -117,7 +111,7 @@ func New() *Prometheus { prometheus.MustRegister(p.countServerCreateErr) prometheus.MustRegister(p.countServerInitErr) prometheus.MustRegister(p.countServerSetupErr) - if registerKnownServers { + if p.registerKnownServers { prometheus.MustRegister(p.knownInstance) } return p @@ -169,7 +163,7 @@ func (m *Prometheus) IncrServerSetupError() { // RegisterKnownInstance registers that we know about a server. func (m *Prometheus) RegisterKnownInstance(instance *autoscaler.Instance) { - if registerKnownServers { + if m.registerKnownServers { m.knownInstance.With(prometheus.Labels{ "name": instance.Name, "provider": string(instance.Provider), @@ -181,7 +175,7 @@ func (m *Prometheus) RegisterKnownInstance(instance *autoscaler.Instance) { // UnregisterKnownInstance forgets a server we once knew. func (m *Prometheus) UnregisterKnownInstance(instance *autoscaler.Instance) { - if registerKnownServers { + if m.registerKnownServers { m.knownInstance.Delete(prometheus.Labels{ "name": instance.Name, "provider": string(instance.Provider), diff --git a/metrics/server_delete_test.go b/metrics/server_delete_test.go index c0e02c00..46dac9f3 100644 --- a/metrics/server_delete_test.go +++ b/metrics/server_delete_test.go @@ -9,6 +9,7 @@ import ( "testing" "github.com/drone/autoscaler" + "github.com/drone/autoscaler/config" "github.com/drone/autoscaler/mocks" "github.com/golang/mock/gomock" "github.com/prometheus/client_golang/prometheus" @@ -35,7 +36,7 @@ func TestServerDelete(t *testing.T) { provider.EXPECT().Destroy(noContext, instance).Times(3).Return(nil) provider.EXPECT().Destroy(noContext, instance).Return(errors.New("error")) - collector := New() + collector := New(config.Config{}) providerInst := ServerDelete(provider, collector) for i := 0; i < 3; i++ { err := providerInst.Destroy(noContext, instance)