diff --git a/coderd/database/queries.sql.go b/coderd/database/queries.sql.go index bd1d5cddd43ed..415c7521ba202 100644 --- a/coderd/database/queries.sql.go +++ b/coderd/database/queries.sql.go @@ -6149,6 +6149,7 @@ WHERE w.id IN ( AND b.template_version_id = t.active_version_id AND p.current_preset_id = $3::uuid AND p.ready + AND NOT t.deleted LIMIT 1 FOR UPDATE OF p SKIP LOCKED -- Ensure that a concurrent request will not select the same prebuild. ) RETURNING w.id, w.name @@ -6184,6 +6185,7 @@ FROM workspace_latest_builds wlb -- prebuilds that are still building. INNER JOIN templates t ON t.active_version_id = wlb.template_version_id WHERE wlb.job_status IN ('pending'::provisioner_job_status, 'running'::provisioner_job_status) + -- AND NOT t.deleted -- We don't exclude deleted templates because there's no constraint in the DB preventing a soft deletion on a template while workspaces are running. GROUP BY t.id, wpb.template_version_id, wpb.transition, wlb.template_version_preset_id ` @@ -6298,6 +6300,7 @@ WITH filtered_builds AS ( WHERE tvp.desired_instances IS NOT NULL -- Consider only presets that have a prebuild configuration. AND wlb.transition = 'start'::workspace_transition AND w.owner_id = 'c42fdf75-3097-471c-8c33-fb52454d81c0' + AND NOT t.deleted ), time_sorted_builds AS ( -- Group builds by preset, then sort each group by created_at. @@ -6449,6 +6452,7 @@ FROM templates t INNER JOIN template_version_presets tvp ON tvp.template_version_id = tv.id INNER JOIN organizations o ON o.id = t.organization_id WHERE tvp.desired_instances IS NOT NULL -- Consider only presets that have a prebuild configuration. + -- AND NOT t.deleted -- We don't exclude deleted templates because there's no constraint in the DB preventing a soft deletion on a template while workspaces are running. AND (t.id = $1::uuid OR $1 IS NULL) ` diff --git a/coderd/database/queries/prebuilds.sql b/coderd/database/queries/prebuilds.sql index 1d3a827c98586..8c27ddf62b7c3 100644 --- a/coderd/database/queries/prebuilds.sql +++ b/coderd/database/queries/prebuilds.sql @@ -15,6 +15,7 @@ WHERE w.id IN ( AND b.template_version_id = t.active_version_id AND p.current_preset_id = @preset_id::uuid AND p.ready + AND NOT t.deleted LIMIT 1 FOR UPDATE OF p SKIP LOCKED -- Ensure that a concurrent request will not select the same prebuild. ) RETURNING w.id, w.name; @@ -40,6 +41,7 @@ FROM templates t INNER JOIN template_version_presets tvp ON tvp.template_version_id = tv.id INNER JOIN organizations o ON o.id = t.organization_id WHERE tvp.desired_instances IS NOT NULL -- Consider only presets that have a prebuild configuration. + -- AND NOT t.deleted -- We don't exclude deleted templates because there's no constraint in the DB preventing a soft deletion on a template while workspaces are running. AND (t.id = sqlc.narg('template_id')::uuid OR sqlc.narg('template_id') IS NULL); -- name: GetRunningPrebuiltWorkspaces :many @@ -70,6 +72,7 @@ FROM workspace_latest_builds wlb -- prebuilds that are still building. INNER JOIN templates t ON t.active_version_id = wlb.template_version_id WHERE wlb.job_status IN ('pending'::provisioner_job_status, 'running'::provisioner_job_status) + -- AND NOT t.deleted -- We don't exclude deleted templates because there's no constraint in the DB preventing a soft deletion on a template while workspaces are running. GROUP BY t.id, wpb.template_version_id, wpb.transition, wlb.template_version_preset_id; -- GetPresetsBackoff groups workspace builds by preset ID. @@ -98,6 +101,7 @@ WITH filtered_builds AS ( WHERE tvp.desired_instances IS NOT NULL -- Consider only presets that have a prebuild configuration. AND wlb.transition = 'start'::workspace_transition AND w.owner_id = 'c42fdf75-3097-471c-8c33-fb52454d81c0' + AND NOT t.deleted ), time_sorted_builds AS ( -- Group builds by preset, then sort each group by created_at. diff --git a/enterprise/coderd/prebuilds/metricscollector.go b/enterprise/coderd/prebuilds/metricscollector.go index 9f1cc837d0474..7a7734b6f8093 100644 --- a/enterprise/coderd/prebuilds/metricscollector.go +++ b/enterprise/coderd/prebuilds/metricscollector.go @@ -157,6 +157,10 @@ func (mc *MetricsCollector) Collect(metricsCh chan<- prometheus.Metric) { continue } + if preset.Deleted { + continue + } + presetSnapshot, err := currentState.snapshot.FilterByPreset(preset.ID) if err != nil { mc.logger.Error(context.Background(), "failed to filter by preset", slog.Error(err)) diff --git a/enterprise/coderd/prebuilds/metricscollector_test.go b/enterprise/coderd/prebuilds/metricscollector_test.go index 07c3c3c6996bb..dce9e07dd110f 100644 --- a/enterprise/coderd/prebuilds/metricscollector_test.go +++ b/enterprise/coderd/prebuilds/metricscollector_test.go @@ -19,6 +19,7 @@ import ( "github.com/coder/coder/v2/coderd/database/dbauthz" "github.com/coder/coder/v2/coderd/database/dbgen" "github.com/coder/coder/v2/coderd/database/dbtestutil" + "github.com/coder/coder/v2/coderd/database/dbtime" agplprebuilds "github.com/coder/coder/v2/coderd/prebuilds" "github.com/coder/coder/v2/codersdk" "github.com/coder/coder/v2/enterprise/coderd/prebuilds" @@ -165,27 +166,12 @@ func TestMetricsCollector(t *testing.T) { eligible: []bool{false}, }, { - name: "deleted templates never desire prebuilds", - transitions: allTransitions, - jobStatuses: allJobStatuses, - initiatorIDs: []uuid.UUID{agplprebuilds.SystemUserID}, - ownerIDs: []uuid.UUID{agplprebuilds.SystemUserID, uuid.New()}, - metrics: []metricCheck{ - {prebuilds.MetricDesiredGauge, ptr.To(0.0), false}, - }, - templateDeleted: []bool{true}, - eligible: []bool{false}, - }, - { - name: "running prebuilds for deleted templates are still counted, so that they can be deleted", - transitions: []database.WorkspaceTransition{database.WorkspaceTransitionStart}, - jobStatuses: []database.ProvisionerJobStatus{database.ProvisionerJobStatusSucceeded}, - initiatorIDs: []uuid.UUID{agplprebuilds.SystemUserID}, - ownerIDs: []uuid.UUID{agplprebuilds.SystemUserID}, - metrics: []metricCheck{ - {prebuilds.MetricRunningGauge, ptr.To(1.0), false}, - {prebuilds.MetricEligibleGauge, ptr.To(0.0), false}, - }, + name: "deleted templates should not be included in exported metrics", + transitions: allTransitions, + jobStatuses: allJobStatuses, + initiatorIDs: []uuid.UUID{agplprebuilds.SystemUserID}, + ownerIDs: []uuid.UUID{agplprebuilds.SystemUserID, uuid.New()}, + metrics: nil, templateDeleted: []bool{true}, eligible: []bool{false}, }, @@ -281,6 +267,12 @@ func TestMetricsCollector(t *testing.T) { "organization_name": org.Name, } + // If no expected metrics have been defined, ensure we don't find any metric series (i.e. metrics with given labels). + if test.metrics == nil { + series := findAllMetricSeries(metricsFamilies, labels) + require.Empty(t, series) + } + for _, check := range test.metrics { metric := findMetric(metricsFamilies, check.name, labels) if check.value == nil { @@ -307,6 +299,131 @@ func TestMetricsCollector(t *testing.T) { } } +// TestMetricsCollector_DuplicateTemplateNames validates a bug that we saw previously which caused duplicate metric series +// registration when a template was deleted and a new one created with the same name (and preset name). +// We are now excluding deleted templates from our metric collection. +func TestMetricsCollector_DuplicateTemplateNames(t *testing.T) { + t.Parallel() + + if !dbtestutil.WillUsePostgres() { + t.Skip("this test requires postgres") + } + + type metricCheck struct { + name string + value *float64 + isCounter bool + } + + type testCase struct { + transition database.WorkspaceTransition + jobStatus database.ProvisionerJobStatus + initiatorID uuid.UUID + ownerID uuid.UUID + metrics []metricCheck + eligible bool + } + + test := testCase{ + transition: database.WorkspaceTransitionStart, + jobStatus: database.ProvisionerJobStatusSucceeded, + initiatorID: agplprebuilds.SystemUserID, + ownerID: agplprebuilds.SystemUserID, + metrics: []metricCheck{ + {prebuilds.MetricCreatedCount, ptr.To(1.0), true}, + {prebuilds.MetricClaimedCount, ptr.To(0.0), true}, + {prebuilds.MetricFailedCount, ptr.To(0.0), true}, + {prebuilds.MetricDesiredGauge, ptr.To(1.0), false}, + {prebuilds.MetricRunningGauge, ptr.To(1.0), false}, + {prebuilds.MetricEligibleGauge, ptr.To(1.0), false}, + }, + eligible: true, + } + + logger := slogtest.Make(t, &slogtest.Options{IgnoreErrors: true}) + clock := quartz.NewMock(t) + db, pubsub := dbtestutil.NewDB(t) + reconciler := prebuilds.NewStoreReconciler(db, pubsub, codersdk.PrebuildsConfig{}, logger, quartz.NewMock(t), prometheus.NewRegistry(), newNoopEnqueuer()) + ctx := testutil.Context(t, testutil.WaitLong) + + collector := prebuilds.NewMetricsCollector(db, logger, reconciler) + registry := prometheus.NewPedanticRegistry() + registry.Register(collector) + + presetName := "default-preset" + defaultOrg := dbgen.Organization(t, db, database.Organization{}) + setupTemplateWithDeps := func() database.Template { + template := setupTestDBTemplateWithinOrg(t, db, test.ownerID, false, "default-template", defaultOrg) + templateVersionID := setupTestDBTemplateVersion(ctx, t, clock, db, pubsub, defaultOrg.ID, test.ownerID, template.ID) + preset := setupTestDBPreset(t, db, templateVersionID, 1, "default-preset") + workspace, _ := setupTestDBWorkspace( + t, clock, db, pubsub, + test.transition, test.jobStatus, defaultOrg.ID, preset, template.ID, templateVersionID, test.initiatorID, test.ownerID, + ) + setupTestDBWorkspaceAgent(t, db, workspace.ID, test.eligible) + return template + } + + // When: starting with a regular template. + template := setupTemplateWithDeps() + labels := map[string]string{ + "template_name": template.Name, + "preset_name": presetName, + "organization_name": defaultOrg.Name, + } + + // nolint:gocritic // Authz context needed to retrieve state. + ctx = dbauthz.AsPrebuildsOrchestrator(ctx) + + // Then: metrics collect successfully. + require.NoError(t, collector.UpdateState(ctx, testutil.WaitLong)) + metricsFamilies, err := registry.Gather() + require.NoError(t, err) + require.NotEmpty(t, findAllMetricSeries(metricsFamilies, labels)) + + // When: the template is deleted. + require.NoError(t, db.UpdateTemplateDeletedByID(ctx, database.UpdateTemplateDeletedByIDParams{ + ID: template.ID, + Deleted: true, + UpdatedAt: dbtime.Now(), + })) + + // Then: metrics collect successfully but are empty because the template is deleted. + require.NoError(t, collector.UpdateState(ctx, testutil.WaitLong)) + metricsFamilies, err = registry.Gather() + require.NoError(t, err) + require.Empty(t, findAllMetricSeries(metricsFamilies, labels)) + + // When: a new template is created with the same name as the deleted template. + newTemplate := setupTemplateWithDeps() + + // Ensure the database has both the new and old (delete) template. + { + deleted, err := db.GetTemplateByOrganizationAndName(ctx, database.GetTemplateByOrganizationAndNameParams{ + OrganizationID: template.OrganizationID, + Deleted: true, + Name: template.Name, + }) + require.NoError(t, err) + require.Equal(t, template.ID, deleted.ID) + + current, err := db.GetTemplateByOrganizationAndName(ctx, database.GetTemplateByOrganizationAndNameParams{ + // Use details from deleted template to ensure they're aligned. + OrganizationID: template.OrganizationID, + Deleted: false, + Name: template.Name, + }) + require.NoError(t, err) + require.Equal(t, newTemplate.ID, current.ID) + } + + // Then: metrics collect successfully. + require.NoError(t, collector.UpdateState(ctx, testutil.WaitLong)) + metricsFamilies, err = registry.Gather() + require.NoError(t, err) + require.NotEmpty(t, findAllMetricSeries(metricsFamilies, labels)) +} + func findMetric(metricsFamilies []*prometheus_client.MetricFamily, name string, labels map[string]string) *prometheus_client.Metric { for _, metricFamily := range metricsFamilies { if metricFamily.GetName() != name { @@ -334,3 +451,33 @@ func findMetric(metricsFamilies []*prometheus_client.MetricFamily, name string, } return nil } + +// findAllMetricSeries finds all metrics with a given set of labels. +func findAllMetricSeries(metricsFamilies []*prometheus_client.MetricFamily, labels map[string]string) map[string]*prometheus_client.Metric { + series := make(map[string]*prometheus_client.Metric) + for _, metricFamily := range metricsFamilies { + for _, metric := range metricFamily.GetMetric() { + labelPairs := metric.GetLabel() + + if len(labelPairs) != len(labels) { + continue + } + + // Convert label pairs to map for easier lookup + metricLabels := make(map[string]string, len(labelPairs)) + for _, label := range labelPairs { + metricLabels[label.GetName()] = label.GetValue() + } + + // Check if all requested labels match + for wantName, wantValue := range labels { + if metricLabels[wantName] != wantValue { + continue + } + } + + series[metricFamily.GetName()] = metric + } + } + return series +} diff --git a/enterprise/coderd/prebuilds/reconcile_test.go b/enterprise/coderd/prebuilds/reconcile_test.go index bdf447dcfae22..660b1733e6cc9 100644 --- a/enterprise/coderd/prebuilds/reconcile_test.go +++ b/enterprise/coderd/prebuilds/reconcile_test.go @@ -294,10 +294,15 @@ func TestPrebuildReconciliation(t *testing.T) { templateDeleted: []bool{false}, }, { - name: "delete prebuilds for deleted templates", + // Templates can be soft-deleted (`deleted=true`) or hard-deleted (row is removed). + // On the former there is *no* DB constraint to prevent soft deletion, so we have to ensure that if somehow + // the template was soft-deleted any running prebuilds will be removed. + // On the latter there is a DB constraint to prevent row deletion if any workspaces reference the deleting template. + name: "soft-deleted templates MAY have prebuilds", prebuildLatestTransitions: []database.WorkspaceTransition{database.WorkspaceTransitionStart}, prebuildJobStatuses: []database.ProvisionerJobStatus{database.ProvisionerJobStatusSucceeded}, templateVersionActive: []bool{true, false}, + shouldCreateNewPrebuild: ptr.To(false), shouldDeleteOldPrebuild: ptr.To(true), templateDeleted: []bool{true}, }, @@ -1060,6 +1065,33 @@ func setupTestDBTemplate( return org, template } +// nolint:revive // It's a control flag, but this is a test. +func setupTestDBTemplateWithinOrg( + t *testing.T, + db database.Store, + userID uuid.UUID, + templateDeleted bool, + templateName string, + org database.Organization, +) database.Template { + t.Helper() + + template := dbgen.Template(t, db, database.Template{ + Name: templateName, + CreatedBy: userID, + OrganizationID: org.ID, + CreatedAt: time.Now().Add(muchEarlier), + }) + if templateDeleted { + ctx := testutil.Context(t, testutil.WaitShort) + require.NoError(t, db.UpdateTemplateDeletedByID(ctx, database.UpdateTemplateDeletedByIDParams{ + ID: template.ID, + Deleted: true, + })) + } + return template +} + const ( earlier = -time.Hour muchEarlier = -time.Hour * 2