From 0fcd543f4a57c00ad4eb047f0f33cc7a8312e4f1 Mon Sep 17 00:00:00 2001 From: Colin Adler Date: Thu, 1 Feb 2024 23:46:07 +0000 Subject: [PATCH 1/4] feat(coderd): add prometheus metrics to servertailnet --- coderd/coderd.go | 6 +++- coderd/tailnet.go | 54 +++++++++++++++++++++++++++- coderd/tailnet_test.go | 81 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 139 insertions(+), 2 deletions(-) diff --git a/coderd/coderd.go b/coderd/coderd.go index 9d640e4b01778..94864971de36a 100644 --- a/coderd/coderd.go +++ b/coderd/coderd.go @@ -472,7 +472,7 @@ func New(options *Options) *API { api.Auditor.Store(&options.Auditor) api.TailnetCoordinator.Store(&options.TailnetCoordinator) - api.agentProvider, err = NewServerTailnet(api.ctx, + stn, err := NewServerTailnet(api.ctx, options.Logger, options.DERPServer, api.DERPMap, @@ -485,6 +485,10 @@ func New(options *Options) *API { if err != nil { panic("failed to setup server tailnet: " + err.Error()) } + api.agentProvider = stn + if options.DeploymentValues.Prometheus.Enable { + options.PrometheusRegistry.MustRegister(stn) + } api.TailnetClientService, err = tailnet.NewClientService( api.Logger.Named("tailnetclient"), &api.TailnetCoordinator, diff --git a/coderd/tailnet.go b/coderd/tailnet.go index fed86ab5aecb0..d3330dd7d2370 100644 --- a/coderd/tailnet.go +++ b/coderd/tailnet.go @@ -14,6 +14,7 @@ import ( "time" "github.com/google/uuid" + "github.com/prometheus/client_golang/prometheus" "go.opentelemetry.io/otel/trace" "golang.org/x/xerrors" "tailscale.com/derp" @@ -97,6 +98,18 @@ func NewServerTailnet( agentConnectionTimes: map[uuid.UUID]time.Time{}, agentTickets: map[uuid.UUID]map[uuid.UUID]struct{}{}, transport: tailnetTransport.Clone(), + connsPerAgent: prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: "coder", + Subsystem: "servertailnet", + Name: "open_conns", + Help: "Total number of TCP connections currently open to workspace agents.", + }, []string{"agent_id"}), + totalConns: prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: "coder", + Subsystem: "servertailnet", + Name: "total_conns", + Help: "Total number of TCP connections made to workspace agents.", + }, []string{"agent_id"}), } tn.transport.DialContext = tn.dialContext // These options are mostly just picked at random, and they can likely be @@ -170,6 +183,16 @@ func NewServerTailnet( return tn, nil } +func (s *ServerTailnet) Describe(descs chan<- *prometheus.Desc) { + s.connsPerAgent.Describe(descs) + s.totalConns.Describe(descs) +} + +func (s *ServerTailnet) Collect(metrics chan<- prometheus.Metric) { + s.connsPerAgent.Collect(metrics) + s.totalConns.Collect(metrics) +} + func (s *ServerTailnet) expireOldAgents() { const ( tick = 5 * time.Minute @@ -304,6 +327,9 @@ type ServerTailnet struct { agentTickets map[uuid.UUID]map[uuid.UUID]struct{} transport *http.Transport + + connsPerAgent *prometheus.GaugeVec + totalConns *prometheus.CounterVec } func (s *ServerTailnet) ReverseProxy(targetURL, dashboardURL *url.URL, agentID uuid.UUID) *httputil.ReverseProxy { @@ -349,7 +375,18 @@ func (s *ServerTailnet) dialContext(ctx context.Context, network, addr string) ( return nil, xerrors.Errorf("no agent id attached") } - return s.DialAgentNetConn(ctx, agentID, network, addr) + nc, err := s.DialAgentNetConn(ctx, agentID, network, addr) + if err != nil { + return nil, err + } + + s.connsPerAgent.With(prometheus.Labels{"agent_id": agentID.String()}).Inc() + s.totalConns.With(prometheus.Labels{"agent_id": agentID.String()}).Inc() + return &instrumentedConn{ + Conn: nc, + agentID: agentID, + connsPerAgent: s.connsPerAgent, + }, nil } func (s *ServerTailnet) ensureAgent(agentID uuid.UUID) error { @@ -455,3 +492,18 @@ func (s *ServerTailnet) Close() error { <-s.derpMapUpdaterClosed return nil } + +type instrumentedConn struct { + net.Conn + + agentID uuid.UUID + closeOnce sync.Once + connsPerAgent *prometheus.GaugeVec +} + +func (c *instrumentedConn) Close() error { + c.closeOnce.Do(func() { + c.connsPerAgent.With(prometheus.Labels{"agent_id": c.agentID.String()}).Dec() + }) + return c.Conn.Close() +} diff --git a/coderd/tailnet_test.go b/coderd/tailnet_test.go index cffe818424827..34bc0f18be7e7 100644 --- a/coderd/tailnet_test.go +++ b/coderd/tailnet_test.go @@ -13,6 +13,8 @@ import ( "testing" "github.com/google/uuid" + "github.com/prometheus/client_golang/prometheus" + dto "github.com/prometheus/client_model/go" "github.com/spf13/afero" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -79,6 +81,43 @@ func TestServerTailnet_ReverseProxy(t *testing.T) { assert.Equal(t, http.StatusOK, res.StatusCode) }) + t.Run("Metrics", func(t *testing.T) { + t.Parallel() + + ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitLong) + defer cancel() + + agents, serverTailnet := setupServerTailnetAgent(t, 1) + a := agents[0] + + registry := prometheus.NewRegistry() + require.NoError(t, registry.Register(serverTailnet)) + + u, err := url.Parse(fmt.Sprintf("http://127.0.0.1:%d", codersdk.WorkspaceAgentHTTPAPIServerPort)) + require.NoError(t, err) + + rp := serverTailnet.ReverseProxy(u, u, a.id) + + rw := httptest.NewRecorder() + req := httptest.NewRequest( + http.MethodGet, + u.String(), + nil, + ).WithContext(ctx) + + rp.ServeHTTP(rw, req) + res := rw.Result() + defer res.Body.Close() + + assert.Equal(t, http.StatusOK, res.StatusCode) + require.Eventually(t, func() bool { + metrics, err := registry.Gather() + assert.NoError(t, err) + return counterHasValue(t, metrics, 1, "coder_servertailnet_total_conns", a.id.String()) && + gaugeHasValue(t, metrics, 1, "coder_servertailnet_open_conns", a.id.String()) + }, testutil.WaitShort, testutil.IntervalFast) + }) + t.Run("HostRewrite", func(t *testing.T) { t.Parallel() @@ -328,3 +367,45 @@ func setupServerTailnetAgent(t *testing.T, agentNum int) ([]agentWithID, *coderd return agents, serverTailnet } + +func gaugeHasValue(t testing.TB, metrics []*dto.MetricFamily, value float64, name string, label ...string) bool { + t.Helper() + for _, family := range metrics { + if family.GetName() != name { + continue + } + ms := family.GetMetric() + metricsLoop: + for _, m := range ms { + require.Equal(t, len(label), len(m.GetLabel())) + for i, lv := range label { + if lv != m.GetLabel()[i].GetValue() { + continue metricsLoop + } + } + return value == m.GetGauge().GetValue() + } + } + return false +} + +func counterHasValue(t testing.TB, metrics []*dto.MetricFamily, value float64, name string, label ...string) bool { + t.Helper() + for _, family := range metrics { + if family.GetName() != name { + continue + } + ms := family.GetMetric() + metricsLoop: + for _, m := range ms { + require.Equal(t, len(label), len(m.GetLabel())) + for i, lv := range label { + if lv != m.GetLabel()[i].GetValue() { + continue metricsLoop + } + } + return value == m.GetCounter().GetValue() + } + } + return false +} From 66cb907e8b91064abb6b2cfa1977e391b166731c Mon Sep 17 00:00:00 2001 From: Colin Adler Date: Mon, 5 Feb 2024 22:09:47 +0000 Subject: [PATCH 2/4] spike comments --- coderd/database/pubsub/pubsub_test.go | 79 +++++++-------------------- coderd/tailnet.go | 24 ++++---- coderd/tailnet_test.go | 47 +--------------- testutil/prometheus.go | 50 +++++++++++++++++ 4 files changed, 83 insertions(+), 117 deletions(-) create mode 100644 testutil/prometheus.go diff --git a/coderd/database/pubsub/pubsub_test.go b/coderd/database/pubsub/pubsub_test.go index e4012ad8eda38..9b2a495aeb091 100644 --- a/coderd/database/pubsub/pubsub_test.go +++ b/coderd/database/pubsub/pubsub_test.go @@ -6,7 +6,6 @@ import ( "testing" "github.com/prometheus/client_golang/prometheus" - dto "github.com/prometheus/client_model/go" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -43,8 +42,8 @@ func TestPGPubsub_Metrics(t *testing.T) { metrics, err := registry.Gather() require.NoError(t, err) - require.True(t, gaugeHasValue(t, metrics, 0, "coder_pubsub_current_events")) - require.True(t, gaugeHasValue(t, metrics, 0, "coder_pubsub_current_subscribers")) + require.True(t, testutil.PromGaugeHasValue(t, metrics, 0, "coder_pubsub_current_events")) + require.True(t, testutil.PromGaugeHasValue(t, metrics, 0, "coder_pubsub_current_subscribers")) event := "test" data := "testing" @@ -63,14 +62,14 @@ func TestPGPubsub_Metrics(t *testing.T) { require.Eventually(t, func() bool { metrics, err = registry.Gather() assert.NoError(t, err) - return gaugeHasValue(t, metrics, 1, "coder_pubsub_current_events") && - gaugeHasValue(t, metrics, 1, "coder_pubsub_current_subscribers") && - gaugeHasValue(t, metrics, 1, "coder_pubsub_connected") && - counterHasValue(t, metrics, 1, "coder_pubsub_publishes_total", "true") && - counterHasValue(t, metrics, 1, "coder_pubsub_subscribes_total", "true") && - counterHasValue(t, metrics, 1, "coder_pubsub_messages_total", "normal") && - counterHasValue(t, metrics, 7, "coder_pubsub_received_bytes_total") && - counterHasValue(t, metrics, 7, "coder_pubsub_published_bytes_total") + return testutil.PromGaugeHasValue(t, metrics, 1, "coder_pubsub_current_events") && + testutil.PromGaugeHasValue(t, metrics, 1, "coder_pubsub_current_subscribers") && + testutil.PromGaugeHasValue(t, metrics, 1, "coder_pubsub_connected") && + testutil.PromCounterHasValue(t, metrics, 1, "coder_pubsub_publishes_total", "true") && + testutil.PromCounterHasValue(t, metrics, 1, "coder_pubsub_subscribes_total", "true") && + testutil.PromCounterHasValue(t, metrics, 1, "coder_pubsub_messages_total", "normal") && + testutil.PromCounterHasValue(t, metrics, 7, "coder_pubsub_received_bytes_total") && + testutil.PromCounterHasValue(t, metrics, 7, "coder_pubsub_published_bytes_total") }, testutil.WaitShort, testutil.IntervalFast) colossalData := make([]byte, 7600) @@ -93,54 +92,14 @@ func TestPGPubsub_Metrics(t *testing.T) { require.Eventually(t, func() bool { metrics, err = registry.Gather() assert.NoError(t, err) - return gaugeHasValue(t, metrics, 1, "coder_pubsub_current_events") && - gaugeHasValue(t, metrics, 2, "coder_pubsub_current_subscribers") && - gaugeHasValue(t, metrics, 1, "coder_pubsub_connected") && - counterHasValue(t, metrics, 2, "coder_pubsub_publishes_total", "true") && - counterHasValue(t, metrics, 2, "coder_pubsub_subscribes_total", "true") && - counterHasValue(t, metrics, 1, "coder_pubsub_messages_total", "normal") && - counterHasValue(t, metrics, 1, "coder_pubsub_messages_total", "colossal") && - counterHasValue(t, metrics, 7607, "coder_pubsub_received_bytes_total") && - counterHasValue(t, metrics, 7607, "coder_pubsub_published_bytes_total") + return testutil.PromGaugeHasValue(t, metrics, 1, "coder_pubsub_current_events") && + testutil.PromGaugeHasValue(t, metrics, 2, "coder_pubsub_current_subscribers") && + testutil.PromGaugeHasValue(t, metrics, 1, "coder_pubsub_connected") && + testutil.PromCounterHasValue(t, metrics, 2, "coder_pubsub_publishes_total", "true") && + testutil.PromCounterHasValue(t, metrics, 2, "coder_pubsub_subscribes_total", "true") && + testutil.PromCounterHasValue(t, metrics, 1, "coder_pubsub_messages_total", "normal") && + testutil.PromCounterHasValue(t, metrics, 1, "coder_pubsub_messages_total", "colossal") && + testutil.PromCounterHasValue(t, metrics, 7607, "coder_pubsub_received_bytes_total") && + testutil.PromCounterHasValue(t, metrics, 7607, "coder_pubsub_published_bytes_total") }, testutil.WaitShort, testutil.IntervalFast) } - -func gaugeHasValue(t testing.TB, metrics []*dto.MetricFamily, value float64, name string, label ...string) bool { - t.Helper() - for _, family := range metrics { - if family.GetName() != name { - continue - } - ms := family.GetMetric() - for _, m := range ms { - require.Equal(t, len(label), len(m.GetLabel())) - for i, lv := range label { - if lv != m.GetLabel()[i].GetValue() { - continue - } - } - return value == m.GetGauge().GetValue() - } - } - return false -} - -func counterHasValue(t testing.TB, metrics []*dto.MetricFamily, value float64, name string, label ...string) bool { - t.Helper() - for _, family := range metrics { - if family.GetName() != name { - continue - } - ms := family.GetMetric() - for _, m := range ms { - require.Equal(t, len(label), len(m.GetLabel())) - for i, lv := range label { - if lv != m.GetLabel()[i].GetValue() { - continue - } - } - return value == m.GetCounter().GetValue() - } - } - return false -} diff --git a/coderd/tailnet.go b/coderd/tailnet.go index d3330dd7d2370..1d901d6372bd8 100644 --- a/coderd/tailnet.go +++ b/coderd/tailnet.go @@ -98,18 +98,18 @@ func NewServerTailnet( agentConnectionTimes: map[uuid.UUID]time.Time{}, agentTickets: map[uuid.UUID]map[uuid.UUID]struct{}{}, transport: tailnetTransport.Clone(), - connsPerAgent: prometheus.NewGaugeVec(prometheus.GaugeOpts{ + connsPerAgent: prometheus.NewGauge(prometheus.GaugeOpts{ Namespace: "coder", Subsystem: "servertailnet", - Name: "open_conns", + Name: "open_tcp_connections", Help: "Total number of TCP connections currently open to workspace agents.", - }, []string{"agent_id"}), - totalConns: prometheus.NewCounterVec(prometheus.CounterOpts{ + }), + totalConns: prometheus.NewCounter(prometheus.CounterOpts{ Namespace: "coder", Subsystem: "servertailnet", - Name: "total_conns", + Name: "tcp_connections_total", Help: "Total number of TCP connections made to workspace agents.", - }, []string{"agent_id"}), + }), } tn.transport.DialContext = tn.dialContext // These options are mostly just picked at random, and they can likely be @@ -328,8 +328,8 @@ type ServerTailnet struct { transport *http.Transport - connsPerAgent *prometheus.GaugeVec - totalConns *prometheus.CounterVec + connsPerAgent prometheus.Gauge + totalConns prometheus.Counter } func (s *ServerTailnet) ReverseProxy(targetURL, dashboardURL *url.URL, agentID uuid.UUID) *httputil.ReverseProxy { @@ -380,8 +380,8 @@ func (s *ServerTailnet) dialContext(ctx context.Context, network, addr string) ( return nil, err } - s.connsPerAgent.With(prometheus.Labels{"agent_id": agentID.String()}).Inc() - s.totalConns.With(prometheus.Labels{"agent_id": agentID.String()}).Inc() + s.connsPerAgent.Inc() + s.totalConns.Inc() return &instrumentedConn{ Conn: nc, agentID: agentID, @@ -498,12 +498,12 @@ type instrumentedConn struct { agentID uuid.UUID closeOnce sync.Once - connsPerAgent *prometheus.GaugeVec + connsPerAgent prometheus.Gauge } func (c *instrumentedConn) Close() error { c.closeOnce.Do(func() { - c.connsPerAgent.With(prometheus.Labels{"agent_id": c.agentID.String()}).Dec() + c.connsPerAgent.Dec() }) return c.Conn.Close() } diff --git a/coderd/tailnet_test.go b/coderd/tailnet_test.go index 34bc0f18be7e7..65fda86778cd0 100644 --- a/coderd/tailnet_test.go +++ b/coderd/tailnet_test.go @@ -14,7 +14,6 @@ import ( "github.com/google/uuid" "github.com/prometheus/client_golang/prometheus" - dto "github.com/prometheus/client_model/go" "github.com/spf13/afero" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -113,8 +112,8 @@ func TestServerTailnet_ReverseProxy(t *testing.T) { require.Eventually(t, func() bool { metrics, err := registry.Gather() assert.NoError(t, err) - return counterHasValue(t, metrics, 1, "coder_servertailnet_total_conns", a.id.String()) && - gaugeHasValue(t, metrics, 1, "coder_servertailnet_open_conns", a.id.String()) + return testutil.PromCounterHasValue(t, metrics, 1, "coder_servertailnet_tcp_connections_total") && + testutil.PromGaugeHasValue(t, metrics, 1, "coder_servertailnet_open_tcp_connections") }, testutil.WaitShort, testutil.IntervalFast) }) @@ -367,45 +366,3 @@ func setupServerTailnetAgent(t *testing.T, agentNum int) ([]agentWithID, *coderd return agents, serverTailnet } - -func gaugeHasValue(t testing.TB, metrics []*dto.MetricFamily, value float64, name string, label ...string) bool { - t.Helper() - for _, family := range metrics { - if family.GetName() != name { - continue - } - ms := family.GetMetric() - metricsLoop: - for _, m := range ms { - require.Equal(t, len(label), len(m.GetLabel())) - for i, lv := range label { - if lv != m.GetLabel()[i].GetValue() { - continue metricsLoop - } - } - return value == m.GetGauge().GetValue() - } - } - return false -} - -func counterHasValue(t testing.TB, metrics []*dto.MetricFamily, value float64, name string, label ...string) bool { - t.Helper() - for _, family := range metrics { - if family.GetName() != name { - continue - } - ms := family.GetMetric() - metricsLoop: - for _, m := range ms { - require.Equal(t, len(label), len(m.GetLabel())) - for i, lv := range label { - if lv != m.GetLabel()[i].GetValue() { - continue metricsLoop - } - } - return value == m.GetCounter().GetValue() - } - } - return false -} diff --git a/testutil/prometheus.go b/testutil/prometheus.go new file mode 100644 index 0000000000000..3d4879c14c324 --- /dev/null +++ b/testutil/prometheus.go @@ -0,0 +1,50 @@ +package testutil + +import ( + "testing" + + dto "github.com/prometheus/client_model/go" + "github.com/stretchr/testify/require" +) + +func PromGaugeHasValue(t testing.TB, metrics []*dto.MetricFamily, value float64, name string, label ...string) bool { + t.Helper() + for _, family := range metrics { + if family.GetName() != name { + continue + } + ms := family.GetMetric() + metricsLoop: + for _, m := range ms { + require.Equal(t, len(label), len(m.GetLabel())) + for i, lv := range label { + if lv != m.GetLabel()[i].GetValue() { + continue metricsLoop + } + } + return value == m.GetGauge().GetValue() + } + } + return false +} + +func PromCounterHasValue(t testing.TB, metrics []*dto.MetricFamily, value float64, name string, label ...string) bool { + t.Helper() + for _, family := range metrics { + if family.GetName() != name { + continue + } + ms := family.GetMetric() + metricsLoop: + for _, m := range ms { + require.Equal(t, len(label), len(m.GetLabel())) + for i, lv := range label { + if lv != m.GetLabel()[i].GetValue() { + continue metricsLoop + } + } + return value == m.GetCounter().GetValue() + } + } + return false +} From 02a7089173eed3ec46d107934591e8819fae6b5f Mon Sep 17 00:00:00 2001 From: Colin Adler Date: Tue, 6 Feb 2024 05:34:37 +0000 Subject: [PATCH 3/4] network=tcp --- coderd/tailnet.go | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/coderd/tailnet.go b/coderd/tailnet.go index 1d901d6372bd8..74b821deb8cc6 100644 --- a/coderd/tailnet.go +++ b/coderd/tailnet.go @@ -98,18 +98,18 @@ func NewServerTailnet( agentConnectionTimes: map[uuid.UUID]time.Time{}, agentTickets: map[uuid.UUID]map[uuid.UUID]struct{}{}, transport: tailnetTransport.Clone(), - connsPerAgent: prometheus.NewGauge(prometheus.GaugeOpts{ + connsPerAgent: prometheus.NewGaugeVec(prometheus.GaugeOpts{ Namespace: "coder", Subsystem: "servertailnet", - Name: "open_tcp_connections", + Name: "open_connections", Help: "Total number of TCP connections currently open to workspace agents.", - }), - totalConns: prometheus.NewCounter(prometheus.CounterOpts{ + }, []string{"network"}), + totalConns: prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: "coder", Subsystem: "servertailnet", - Name: "tcp_connections_total", + Name: "connections_total", Help: "Total number of TCP connections made to workspace agents.", - }), + }, []string{"network"}), } tn.transport.DialContext = tn.dialContext // These options are mostly just picked at random, and they can likely be @@ -328,8 +328,8 @@ type ServerTailnet struct { transport *http.Transport - connsPerAgent prometheus.Gauge - totalConns prometheus.Counter + connsPerAgent *prometheus.GaugeVec + totalConns *prometheus.CounterVec } func (s *ServerTailnet) ReverseProxy(targetURL, dashboardURL *url.URL, agentID uuid.UUID) *httputil.ReverseProxy { @@ -380,8 +380,8 @@ func (s *ServerTailnet) dialContext(ctx context.Context, network, addr string) ( return nil, err } - s.connsPerAgent.Inc() - s.totalConns.Inc() + s.connsPerAgent.WithLabelValues("tcp").Inc() + s.totalConns.WithLabelValues("tcp").Inc() return &instrumentedConn{ Conn: nc, agentID: agentID, @@ -498,12 +498,12 @@ type instrumentedConn struct { agentID uuid.UUID closeOnce sync.Once - connsPerAgent prometheus.Gauge + connsPerAgent *prometheus.GaugeVec } func (c *instrumentedConn) Close() error { c.closeOnce.Do(func() { - c.connsPerAgent.Dec() + c.connsPerAgent.WithLabelValues("tcp").Dec() }) return c.Conn.Close() } From 202bd33374d3c600d55bb9dae200cfd9fdcfdb3a Mon Sep 17 00:00:00 2001 From: Colin Adler Date: Tue, 6 Feb 2024 05:38:08 +0000 Subject: [PATCH 4/4] fixup! network=tcp --- coderd/tailnet_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/coderd/tailnet_test.go b/coderd/tailnet_test.go index 65fda86778cd0..73ccba701b632 100644 --- a/coderd/tailnet_test.go +++ b/coderd/tailnet_test.go @@ -112,8 +112,8 @@ func TestServerTailnet_ReverseProxy(t *testing.T) { require.Eventually(t, func() bool { metrics, err := registry.Gather() assert.NoError(t, err) - return testutil.PromCounterHasValue(t, metrics, 1, "coder_servertailnet_tcp_connections_total") && - testutil.PromGaugeHasValue(t, metrics, 1, "coder_servertailnet_open_tcp_connections") + return testutil.PromCounterHasValue(t, metrics, 1, "coder_servertailnet_connections_total", "tcp") && + testutil.PromGaugeHasValue(t, metrics, 1, "coder_servertailnet_open_connections", "tcp") }, testutil.WaitShort, testutil.IntervalFast) })