package prebuilds

import (
	"context"
	"database/sql"
	"encoding/json"
	"errors"
	"fmt"
	"math"
	"strings"
	"sync"
	"sync/atomic"
	"time"

	"github.com/google/uuid"
	"github.com/hashicorp/go-multierror"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
	"go.opentelemetry.io/otel/attribute"
	"go.opentelemetry.io/otel/trace"
	"golang.org/x/sync/errgroup"
	"golang.org/x/xerrors"

	"cdr.dev/slog/v3"
	"github.com/coder/coder/v2/coderd/audit"
	"github.com/coder/coder/v2/coderd/database"
	"github.com/coder/coder/v2/coderd/database/dbauthz"
	"github.com/coder/coder/v2/coderd/database/provisionerjobs"
	"github.com/coder/coder/v2/coderd/database/pubsub"
	"github.com/coder/coder/v2/coderd/files"
	"github.com/coder/coder/v2/coderd/notifications"
	"github.com/coder/coder/v2/coderd/prebuilds"
	"github.com/coder/coder/v2/coderd/rbac"
	"github.com/coder/coder/v2/coderd/rbac/policy"
	"github.com/coder/coder/v2/coderd/tracing"
	"github.com/coder/coder/v2/coderd/wsbuilder"
	"github.com/coder/coder/v2/codersdk"
	sdkproto "github.com/coder/coder/v2/provisionersdk/proto"
	"github.com/coder/quartz"
)

// StoreReconciler reconciles prebuilt workspaces against the state held in a
// database.Store. It is constructed by NewStoreReconciler, driven by Run and
// shut down by Stop.
type StoreReconciler struct {
	// Collaborators and configuration handed to NewStoreReconciler. The tracer
	// is obtained from the tracer provider passed to the constructor.
	store             database.Store
	cfg               codersdk.PrebuildsConfig
	pubsub            pubsub.Pubsub
	fileCache         *files.Cache
	logger            slog.Logger
	clock             quartz.Clock
	registerer        prometheus.Registerer
	notifEnq          notifications.Enqueuer
	buildUsageChecker *atomic.Pointer[wsbuilder.UsageChecker]
	tracer            trace.Tracer

	// Lifecycle state shared between Run and Stop. cancelFn cancels the context
	// of the Run loop; it is set by Run and invoked by Stop. running is set by
	// Run once cancelFn is in place and cleared when Stop returns. stopped is
	// set by Stop so that repeated Stop calls short-circuit. done (capacity 1)
	// receives a value when Run has exited. provisionNotifyCh (capacity 10)
	// carries provisioner jobs that Run posts to pubsub.
	cancelFn          context.CancelCauseFunc
	running           atomic.Bool
	stopped           atomic.Bool
	done              chan struct{}
	provisionNotifyCh chan database.ProvisionerJob

	// reconciliationConcurrency caps how many presets ReconcileAll reconciles
	// in parallel; it is derived from the maximum number of DB connections.
	reconciliationConcurrency int

	// Prebuild state metrics; nil when no prometheus registerer was supplied.
	metrics *MetricsCollector
	// Operational metrics; nil when no prometheus registerer was supplied.
	reconciliationDuration prometheus.Histogram
}

// Compile-time assertion that *StoreReconciler satisfies prebuilds.ReconciliationOrchestrator.
var _ prebuilds.ReconciliationOrchestrator = &StoreReconciler{}

// DeprovisionMode selects how a prebuilt workspace delete is provisioned.
type DeprovisionMode int

const (
	// DeprovisionModeNormal is the zero value and requests a regular delete.
	DeprovisionModeNormal DeprovisionMode = iota
	// DeprovisionModeOrphan requests an orphan delete (see provisionDelete).
	DeprovisionModeOrphan
)

// String returns the lowercase name of the mode ("normal" or "orphan"), or
// "unknown" for any value that is not one of the declared constants.
func (d DeprovisionMode) String() string {
	if d == DeprovisionModeNormal {
		return "normal"
	}
	if d == DeprovisionModeOrphan {
		return "orphan"
	}
	return "unknown"
}

// NewStoreReconciler builds a StoreReconciler from its collaborators.
//
// The preset reconciliation concurrency is derived from maxDBConnections (see
// calculateReconciliationConcurrency). When registerer is non-nil, a metrics
// collector and a reconciliation-duration histogram are created and registered
// with it; a failure to register the collector is logged and is not fatal.
// When registerer is nil, no metrics are created.
func NewStoreReconciler(store database.Store,
	ps pubsub.Pubsub,
	fileCache *files.Cache,
	cfg codersdk.PrebuildsConfig,
	logger slog.Logger,
	clock quartz.Clock,
	registerer prometheus.Registerer,
	notifEnq notifications.Enqueuer,
	buildUsageChecker *atomic.Pointer[wsbuilder.UsageChecker],
	tracerProvider trace.TracerProvider,
	maxDBConnections int,
) *StoreReconciler {
	concurrency := calculateReconciliationConcurrency(maxDBConnections)
	logger.Debug(context.Background(), "reconciler initialized",
		slog.F("reconciliation_concurrency", concurrency),
		slog.F("max_db_connections", maxDBConnections))

	r := &StoreReconciler{
		store:                     store,
		pubsub:                    ps,
		fileCache:                 fileCache,
		logger:                    logger,
		cfg:                       cfg,
		clock:                     clock,
		registerer:                registerer,
		notifEnq:                  notifEnq,
		buildUsageChecker:         buildUsageChecker,
		tracer:                    tracerProvider.Tracer(tracing.TracerName),
		done:                      make(chan struct{}, 1),
		provisionNotifyCh:         make(chan database.ProvisionerJob, 10),
		reconciliationConcurrency: concurrency,
	}

	// Metrics are optional; without a registerer there is nothing left to set up.
	if registerer == nil {
		return r
	}

	r.metrics = NewMetricsCollector(store, logger, r)
	if err := registerer.Register(r.metrics); err != nil {
		// If the registerer fails to register the metrics collector, it's not fatal.
		logger.Error(context.Background(), "failed to register prometheus metrics", slog.Error(err))
	}

	r.reconciliationDuration = promauto.With(registerer).NewHistogram(prometheus.HistogramOpts{
		Namespace: "coderd",
		Subsystem: "prebuilds",
		Name:      "reconciliation_duration_seconds",
		Help:      "Duration of each prebuilds reconciliation cycle.",
		Buckets:   prometheus.DefBuckets,
	})

	return r
}

// calculateReconciliationConcurrency returns how many presets may be reconciled
// concurrently for a database pool of maxDBConnections connections.
//
// Every preset reconciliation can issue several database operations
// (creates/deletes), so the result is capped to keep the connection pool from
// being exhausted while still allowing some parallelism: half the pool size,
// clamped to the range [1, 5]. Non-positive pool sizes yield 1.
// TODO(ssncferreira): If this becomes a bottleneck, consider adding a configuration option.
func calculateReconciliationConcurrency(maxDBConnections int) int {
	const (
		lowerBound = 1
		upperBound = 5
	)

	// Integer division truncates toward zero, so any pool size below 2
	// (including zero and negatives) lands under the lower bound.
	half := maxDBConnections / 2
	switch {
	case half < lowerBound:
		return lowerBound
	case half > upperBound:
		return upperBound
	default:
		return half
	}
}

// Run starts the reconciliation loop and blocks until its context is canceled,
// either through the parent ctx or through Stop.
//
// On every tick of the configured reconciliation interval (5 minutes when the
// configured value is not positive) it calls ReconcileAll, records the elapsed
// time in the reconciliation-duration histogram when one exists, and logs the
// cycle's stats. Alongside the loop it starts the background metrics fetch
// (when metrics are enabled) and a worker that posts queued provisioner jobs
// to pubsub.
//
// When the loop exits, Run waits for the metrics goroutine and then sends a
// single value on c.done, which Stop waits for.
func (c *StoreReconciler) Run(ctx context.Context) {
	reconciliationInterval := c.cfg.ReconciliationInterval.Value()
	if reconciliationInterval <= 0 { // avoids a panic
		reconciliationInterval = 5 * time.Minute
	}

	c.logger.Info(ctx, "starting reconciler",
		slog.F("interval", reconciliationInterval),
		slog.F("backoff_interval", c.cfg.ReconciliationBackoffInterval.String()),
		slog.F("backoff_lookback", c.cfg.ReconciliationBackoffLookback.String()),
		slog.F("preset_concurrency", c.reconciliationConcurrency))

	var wg sync.WaitGroup
	ticker := c.clock.NewTicker(reconciliationInterval)
	defer ticker.Stop()
	defer func() {
		wg.Wait()
		c.done <- struct{}{}
	}()

	// nolint:gocritic // Reconciliation Loop needs Prebuilds Orchestrator permissions.
	ctx, cancel := context.WithCancelCause(dbauthz.AsPrebuildsOrchestrator(ctx))
	c.cancelFn = cancel

	// Start updating metrics in the background.
	if c.metrics != nil {
		wg.Add(1)
		go func() {
			defer wg.Done()
			c.metrics.BackgroundFetch(ctx, metricsUpdateInterval, metricsUpdateTimeout)
		}()
	}

	// Everything is in place, reconciler can now be considered as running.
	//
	// NOTE: without this atomic bool, Stop might race with Run for the c.cancelFn above.
	c.running.Store(true)

	// Publish provisioning jobs outside of database transactions.
	// A connection is held while a database transaction is active; PGPubsub also tries to acquire a new connection on
	// Publish, so we can exhaust available connections.
	//
	// A single worker dequeues from the channel, which should be sufficient.
	// If any messages are missed due to congestion or errors, provisionerdserver has a backup polling mechanism which
	// will periodically pick up any queued jobs (see poll(time.Duration) in coderd/provisionerdserver/acquirer.go).
	//
	// The worker deliberately does NOT receive from c.done: that channel carries a single value intended for Stop,
	// and consuming it here would leave Stop blocked until its own context expires. The loop below only returns
	// once ctx is done, so ctx.Done() is sufficient to terminate this goroutine.
	go func() {
		for {
			select {
			case <-ctx.Done():
				return
			case job := <-c.provisionNotifyCh:
				err := provisionerjobs.PostJob(c.pubsub, job)
				if err != nil {
					c.logger.Error(ctx, "failed to post provisioner job to pubsub", slog.Error(err))
				}
			}
		}
	}()

	for {
		select {
		// TODO: implement pubsub listener to allow reconciling a specific template imperatively once it has been changed,
		//		 instead of waiting for the next reconciliation interval
		case <-ticker.C:
			// Trigger a new iteration on each tick.
			stats, err := c.ReconcileAll(ctx)
			if err != nil {
				c.logger.Error(context.Background(), "reconciliation failed", slog.Error(err))
			}

			if c.reconciliationDuration != nil {
				c.reconciliationDuration.Observe(stats.Elapsed.Seconds())
			}
			c.logger.Info(ctx, "reconciliation stats",
				slog.F("elapsed", stats.Elapsed),
				slog.F("presets_total", stats.PresetsTotal),
				slog.F("presets_reconciled", stats.PresetsReconciled),
			)
		case <-ctx.Done():
			// nolint:gocritic // it's okay to use slog.F() for an error in this case
			// because we want to differentiate two different types of errors: ctx.Err() and context.Cause()
			c.logger.Warn(
				context.Background(),
				"reconciliation loop exited",
				slog.Error(ctx.Err()),
				slog.F("cause", context.Cause(ctx)),
			)
			return
		}
	}
}

// Stop shuts the reconciler down. It is safe to call more than once; only the
// first call does any work beyond logging.
//
// The first call unregisters the metrics collector and the duration histogram
// (when present). If Run is active it then cancels the Run context with cause
// and waits until either Run signals completion on c.done or ctx is done,
// whichever happens first. The running flag is cleared on return in all cases.
func (c *StoreReconciler) Stop(ctx context.Context, cause error) {
	defer c.running.Store(false)

	var causeFields []slog.Field
	if cause != nil {
		causeFields = append(causeFields, slog.F("cause", cause.Error()))
	}
	c.logger.Info(context.Background(), "stopping reconciler", causeFields...)

	// Swap returns the previous value, so a true result means Stop already ran.
	//
	// NOTE: we need to *prospectively* mark this as stopped to prevent Stop being called multiple times and causing problems.
	if alreadyStopped := c.stopped.Swap(true); alreadyStopped {
		return
	}

	// Unregister prebuilds state and operational metrics.
	if c.metrics != nil && c.registerer != nil {
		if ok := c.registerer.Unregister(c.metrics); !ok {
			// The API doesn't allow us to know why the de-registration failed, but it's not very consequential.
			// The only time this would be an issue is if the premium license is removed, leading to the feature being
			// disabled (and consequently this Stop method being called), and then adding a new license which enables the
			// feature again. If the metrics cannot be registered, it'll log an error from NewStoreReconciler.
			c.logger.Warn(context.Background(), "failed to unregister metrics collector")
		}
		if c.reconciliationDuration != nil && !c.registerer.Unregister(c.reconciliationDuration) {
			c.logger.Warn(context.Background(), "failed to unregister reconciliation duration histogram")
		}
	}

	// A reconciler that never started running has no loop to cancel or wait for.
	if !c.running.Load() {
		return
	}

	if cancel := c.cancelFn; cancel != nil {
		cancel(cause)
	}

	select {
	// The control loop exited.
	case <-c.done:
		c.logger.Info(context.Background(), "reconciler stopped")
	// The caller's context ended first; give up waiting for the control loop.
	case <-ctx.Done():
		// nolint:gocritic // it's okay to use slog.F() for an error in this case
		// because we want to differentiate two different types of errors: ctx.Err() and context.Cause()
		c.logger.Error(
			context.Background(),
			"reconciler stop exited prematurely",
			slog.Error(ctx.Err()),
			slog.F("cause", context.Cause(ctx)),
		)
	}
}

// ReconcileAll runs one reconciliation cycle: for every
// (organization, template, template version, preset) tuple it compares the desired
// and actual prebuild state and executes the resulting provisioning actions.
//
// Actions are fire-and-forget; the cycle does not wait for prebuilt workspaces to
// finish provisioning. Failures of individual presets are logged and do not abort
// the cycle. The returned stats always carry the elapsed time of the call.
//
// The whole cycle runs inside a read-only transaction that holds an advisory lock,
// so only one replica reconciles at a time. Per-preset goroutines use their own
// independent write transactions (via c.store). If the lock transaction fails
// mid-cycle, those goroutines may keep running while another replica takes the
// lock, which can briefly under/over-provision; later cycles converge because the
// loop is eventually consistent.
//
// NOTE: Read operations must use db (the lock transaction) while write operations must use c.store.
func (c *StoreReconciler) ReconcileAll(ctx context.Context) (stats prebuilds.ReconcileStats, err error) {
	ctx, span := c.tracer.Start(ctx, "prebuilds.ReconcileAll")
	defer span.End()

	startedAt := c.clock.Now()
	defer func() {
		stats.Elapsed = c.clock.Since(startedAt)
	}()

	logger := c.logger.With(slog.F("reconcile_context", "all"))

	// Bail out quietly (no error) when the context is already done.
	if ctxErr := ctx.Err(); ctxErr != nil {
		logger.Warn(context.Background(), "reconcile exiting prematurely; context done", slog.Error(ctxErr))
		return stats, nil
	}

	logger.Debug(ctx, "starting reconciliation")

	err = c.WithReconciliationLock(ctx, logger, func(ctx context.Context, db database.Store) error {
		// Read the pause setting through db (the lock tx), which is read-only.
		rawSettings, err := db.GetPrebuildsSettings(ctx)
		if err != nil {
			return xerrors.Errorf("get prebuilds settings: %w", err)
		}

		var settings codersdk.PrebuildsSettings
		if len(rawSettings) > 0 {
			if err := json.Unmarshal([]byte(rawSettings), &settings); err != nil {
				return xerrors.Errorf("unmarshal prebuilds settings: %w", err)
			}
		}

		paused := settings.ReconciliationPaused
		if c.metrics != nil {
			c.metrics.setReconciliationPaused(paused)
		}
		if paused {
			logger.Info(ctx, "prebuilds reconciliation is paused, skipping reconciliation")
			return nil
		}

		// Membership reconciliation writes, so it must go through c.store rather
		// than the read-only lock transaction.
		membership := NewStoreMembershipReconciler(c.store, c.clock, logger)
		if err := membership.ReconcileAll(ctx, database.PrebuildsSystemUserID, PrebuiltWorkspacesGroupName); err != nil {
			return xerrors.Errorf("reconcile prebuild membership: %w", err)
		}

		// The snapshot is read-only, so it uses db (the lock tx).
		snapshot, err := c.SnapshotState(ctx, db)
		if err != nil {
			return xerrors.Errorf("determine current snapshot: %w", err)
		}

		c.reportHardLimitedPresets(snapshot)

		if len(snapshot.Presets) == 0 {
			logger.Debug(ctx, "no templates found with prebuilds configured")
			return nil
		}

		// Bound the number of in-flight presets to avoid exhausting the coderd
		// database connection pool.
		var workers errgroup.Group
		workers.SetLimit(c.reconciliationConcurrency)

		reconciled := 0

		// One goroutine per preset that actually needs work.
		for _, preset := range snapshot.Presets {
			presetSnapshot, err := snapshot.FilterByPreset(preset.ID)
			if err != nil {
				logger.Warn(ctx, "failed to find preset snapshot", slog.Error(err), slog.F("preset_id", preset.ID.String()))
				continue
			}

			// Presets that need no database operations are skipped up front so
			// they never occupy a slot in the limiter.
			if presetSnapshot.CanSkipReconciliation() {
				continue
			}

			reconciled++

			workers.Go(func() error {
				// Pass outer context.
				if err := c.ReconcilePreset(ctx, *presetSnapshot); err != nil {
					logger.Error(
						ctx,
						"failed to reconcile prebuilds for preset",
						slog.Error(err),
						slog.F("preset_id", preset.ID),
					)
				}
				// DO NOT return error otherwise the tx will end.
				return nil
			})
		}

		stats.PresetsTotal = len(snapshot.Presets)
		stats.PresetsReconciled = reconciled

		// Hold the lock until every preset goroutine has finished.
		return workers.Wait()
	})
	if err != nil {
		logger.Error(ctx, "failed to reconcile", slog.Error(err))
	}

	return stats, err
}

// reportHardLimitedPresets hands the metrics collector the set of presets that
// should currently be reported as hard-limited, keyed by
// (organization name, template name, preset name).
//
// It is a no-op when metrics are disabled (c.metrics is nil, which is the case
// when the reconciler was created without a prometheus registerer).
func (c *StoreReconciler) reportHardLimitedPresets(snapshot *prebuilds.GlobalSnapshot) {
	if c.metrics == nil {
		return
	}

	// Several presets can share the same key (orgName:templateName:presetName), because templates can have
	// multiple versions — or deleted templates can share the same name.
	//
	// Report a key as hard-limited only if at least one of its presets meets all the following conditions:
	// - The preset is marked as hard-limited
	// - The preset is using the active version of its template, and the template has not been deleted
	//
	// The second condition is important because a hard-limited preset that has become outdated is no longer relevant.
	// Its associated prebuilt workspaces were likely deleted, and it's not meaningful to continue reporting it
	// as hard-limited to the admin.
	//
	// This approach accounts for all relevant scenarios:
	// Scenario #1: The admin created a new template version with the same preset names.
	// Scenario #2: The admin created a new template version and renamed the presets.
	// Scenario #3: The admin deleted a template version that contained hard-limited presets.
	//
	// In all of these cases, only the latest and non-deleted presets will be reported.
	// All other presets will be ignored and eventually removed from Prometheus.
	isPresetHardLimited := make(map[hardLimitedPresetKey]bool)
	for _, preset := range snapshot.Presets {
		key := hardLimitedPresetKey{
			orgName:      preset.OrganizationName,
			templateName: preset.TemplateName,
			presetName:   preset.Name,
		}

		// Once a key is known to be hard-limited, its remaining presets need no further checks.
		if isPresetHardLimited[key] {
			continue
		}

		if preset.UsingActiveVersion && !preset.Deleted && snapshot.IsHardLimited(preset.ID) {
			isPresetHardLimited[key] = true
		}
	}

	c.metrics.registerHardLimitedPresets(isPresetHardLimited)
}

// SnapshotState captures the current state of all prebuilds across templates.
//
// All reads happen inside a single read-only, repeatable-read transaction on
// store so that they observe one consistent view. When no presets with
// prebuilds exist, a zero-valued snapshot is returned. On any error (including
// ctx already being done) the returned snapshot is nil.
func (c *StoreReconciler) SnapshotState(ctx context.Context, store database.Store) (*prebuilds.GlobalSnapshot, error) {
	ctx, span := c.tracer.Start(ctx, "prebuilds.SnapshotState")
	defer span.End()

	if err := ctx.Err(); err != nil {
		return nil, err
	}

	var state prebuilds.GlobalSnapshot

	// If called with a store that is already in a transaction,
	// InTx will reuse that transaction rather than creating a new one.
	err := store.InTx(func(db database.Store) error {
		// TODO: implement template-specific reconciliations later
		presetsWithPrebuilds, err := db.GetTemplatePresetsWithPrebuilds(ctx, uuid.NullUUID{})
		if err != nil {
			return xerrors.Errorf("failed to get template presets with prebuilds: %w", err)
		}
		if len(presetsWithPrebuilds) == 0 {
			return nil
		}

		presetPrebuildSchedules, err := db.GetActivePresetPrebuildSchedules(ctx)
		if err != nil {
			return xerrors.Errorf("failed to get preset prebuild schedules: %w", err)
		}

		// All prebuilt workspaces that are currently running.
		allRunningPrebuilds, err := db.GetRunningPrebuiltWorkspaces(ctx)
		if err != nil {
			return xerrors.Errorf("failed to get running prebuilds: %w", err)
		}

		allPrebuildsInProgress, err := db.CountInProgressPrebuilds(ctx)
		if err != nil {
			return xerrors.Errorf("failed to get prebuilds in progress: %w", err)
		}

		allPendingPrebuilds, err := db.CountPendingNonActivePrebuilds(ctx)
		if err != nil {
			return xerrors.Errorf("failed to get pending prebuilds: %w", err)
		}

		// Backoffs are computed over the configured lookback window ending now.
		presetsBackoff, err := db.GetPresetsBackoff(ctx, c.clock.Now().Add(-c.cfg.ReconciliationBackoffLookback.Value()))
		if err != nil {
			return xerrors.Errorf("failed to get backoffs for presets: %w", err)
		}

		hardLimitedPresets, err := db.GetPresetsAtFailureLimit(ctx, c.cfg.FailureHardLimit.Value())
		if err != nil {
			return xerrors.Errorf("failed to get hard limited presets: %w", err)
		}

		state = prebuilds.NewGlobalSnapshot(
			presetsWithPrebuilds,
			presetPrebuildSchedules,
			allRunningPrebuilds,
			allPrebuildsInProgress,
			allPendingPrebuilds,
			presetsBackoff,
			hardLimitedPresets,
			c.clock,
			c.logger,
		)
		return nil
	}, &database.TxOptions{
		Isolation:    sql.LevelRepeatableRead, // This mirrors the MVCC snapshotting Postgres does when using CTEs
		ReadOnly:     true,
		TxIdentifier: "prebuilds.SnapshotState",
	})
	if err != nil {
		// Never hand out a snapshot alongside an error; it may be empty or stale.
		return nil, err
	}

	return &state, nil
}

// ReconcilePreset reconciles a single preset described by ps.
//
// If the preset has just reached the hard failure limit, its status is first
// persisted as hard-limited. The reconciliation actions are then calculated and
// executed one by one; a failing action is logged and does not prevent the
// remaining actions from running. The returned error aggregates all action
// failures, or is nil when every action succeeded.
func (c *StoreReconciler) ReconcilePreset(ctx context.Context, ps prebuilds.PresetSnapshot) error {
	preset := ps.Preset

	ctx, span := c.tracer.Start(ctx, "prebuilds.ReconcilePreset", trace.WithAttributes(
		attribute.String("preset_id", preset.ID.String()),
		attribute.String("preset_name", preset.Name),
		attribute.String("template_id", preset.TemplateID.String()),
		attribute.String("template_name", preset.TemplateName),
	))
	defer span.End()

	logger := c.logger.With(
		slog.F("template_id", preset.TemplateID.String()),
		slog.F("template_name", preset.TemplateName),
		slog.F("template_version_id", preset.TemplateVersionID),
		slog.F("template_version_name", preset.TemplateVersionName),
		slog.F("preset_id", preset.ID),
		slog.F("preset_name", preset.Name),
	)

	// The preset hit the hard failure limit for the first time in this iteration:
	// record that in the database, then carry on. Only creation is disallowed for
	// hard-limited presets; deletion is still allowed.
	// NOTE(review): the log line mentions notifying template admins, but no
	// notification is enqueued in this method — confirm it happens elsewhere.
	newlyHardLimited := ps.IsHardLimited && preset.PrebuildStatus != database.PrebuildStatusHardLimited
	if newlyHardLimited {
		logger.Warn(ctx, "preset is hard limited, notifying template admins")

		params := database.UpdatePresetPrebuildStatusParams{
			Status:   database.PrebuildStatusHardLimited,
			PresetID: preset.ID,
		}
		if err := c.store.UpdatePresetPrebuildStatus(ctx, params); err != nil {
			return xerrors.Errorf("failed to update preset prebuild status: %w", err)
		}
	}

	state := ps.CalculateState()
	actions, err := c.CalculateActions(ctx, ps)
	if err != nil {
		logger.Error(ctx, "failed to calculate actions for preset", slog.Error(err))
		return err
	}

	logger.Debug(ctx, "calculated reconciliation state for preset",
		slog.F("desired", state.Desired),
		slog.F("actual", state.Actual),
		slog.F("extraneous", state.Extraneous),
		slog.F("starting", state.Starting),
		slog.F("stopping", state.Stopping),
		slog.F("deleting", state.Deleting),
		slog.F("eligible", state.Eligible),
	)

	// Run every action even if an earlier one failed; collect the failures.
	var failures multierror.Error
	for _, action := range actions {
		if actionErr := c.executeReconciliationAction(ctx, logger, ps, action); actionErr != nil {
			logger.Error(ctx, "failed to execute action", slog.F("type", action.ActionType), slog.Error(actionErr))
			failures.Errors = append(failures.Errors, actionErr)
		}
	}
	return failures.ErrorOrNil()
}

// CalculateActions returns the reconciliation actions for the given preset
// snapshot, using the configured reconciliation backoff interval. It returns
// the context's error without calculating anything when ctx is already done.
func (c *StoreReconciler) CalculateActions(ctx context.Context, snapshot prebuilds.PresetSnapshot) ([]*prebuilds.ReconciliationActions, error) {
	if err := ctx.Err(); err != nil {
		return nil, err
	}

	backoffInterval := c.cfg.ReconciliationBackoffInterval.Value()
	return snapshot.CalculateActions(backoffInterval)
}

// WithReconciliationLock runs fn while holding the global prebuilds
// reconciliation lock.
//
// The lock is taken inside a read-only, repeatable-read transaction on c.store
// and fn receives that transaction as db. If another replica already holds the
// lock, fn is not called and nil is returned. A database error while trying to
// acquire the lock is logged and returned. Otherwise the result of fn is
// returned.
func (c *StoreReconciler) WithReconciliationLock(
	ctx context.Context,
	logger slog.Logger,
	fn func(ctx context.Context, db database.Store) error,
) error {
	ctx, span := c.tracer.Start(ctx, "prebuilds.WithReconciliationLock")
	defer span.End()

	// This tx holds a global lock, which prevents any other coderd replica from starting a reconciliation and
	// possibly getting an inconsistent view of the state.
	//
	// The lock MUST be held until ALL modifications have been effected.
	//
	// It is run with RepeatableRead isolation, so it's effectively snapshotting the data at the start of the tx.
	//
	// This is a read-only tx, so returning an error (i.e. causing a rollback) has no impact.
	lockTxOptions := &database.TxOptions{
		Isolation:    sql.LevelRepeatableRead,
		ReadOnly:     true,
		TxIdentifier: "prebuilds.WithReconciliationLock",
	}

	underLock := func(db database.Store) error {
		requestedAt := c.clock.Now()

		// Non-blocking attempt: failing to get the lock means another replica is reconciling.
		acquired, err := db.TryAcquireLock(ctx, database.LockIDReconcilePrebuilds)
		if err != nil {
			// This is a real database error, not just lock contention
			logger.Error(ctx, "failed to acquire reconciliation lock due to database error", slog.Error(err))
			return err
		}

		span.SetAttributes(attribute.Bool("lock_acquired", acquired))
		if !acquired {
			// Normal case: another replica has the lock
			return nil
		}

		waited := fmt.Sprintf("%.4f", c.clock.Since(requestedAt).Seconds())
		logger.Debug(ctx,
			"acquired top-level reconciliation lock",
			slog.F("acquire_wait_secs", waited),
		)

		return fn(ctx, db)
	}

	return c.store.InTx(underLock, lockTxOptions)
}

// executeReconciliationAction executes a reconciliation action on the given preset snapshot.
//
// The action can be of different types (create, delete, backoff, cancel-pending), and may internally
// include multiple items to process, for example, a delete action can contain multiple prebuild IDs
// to delete, and a create action includes a count of prebuilds to create.
//
// The calculated action is logged at a level that reflects its impact: warn for a backoff, info when
// something will be created, deleted or canceled, and debug otherwise. The method then performs the
// operations for the action type. Failures of individual creates/deletes are logged and aggregated;
// the returned error is nil only if every part of the action succeeded. An unknown action type
// results in an error.
func (c *StoreReconciler) executeReconciliationAction(ctx context.Context, logger slog.Logger, ps prebuilds.PresetSnapshot, action *prebuilds.ReconciliationActions) error {
	ctx, span := c.tracer.Start(ctx, "prebuilds.executeReconciliationAction", trace.WithAttributes(
		attribute.Int("action_type", int(action.ActionType)),
		attribute.Int("create_count", int(action.Create)),
		attribute.Int("delete_count", len(action.DeleteIDs)),
	))
	defer span.End()

	// Nothing has to be done.
	if !ps.Preset.UsingActiveVersion && action.IsNoop() {
		logger.Debug(ctx, "skipping reconciliation for preset - nothing has to be done",
			slog.F("template_id", ps.Preset.TemplateID.String()), slog.F("template_name", ps.Preset.TemplateName),
			slog.F("template_version_id", ps.Preset.TemplateVersionID.String()), slog.F("template_version_name", ps.Preset.TemplateVersionName),
			slog.F("preset_id", ps.Preset.ID.String()), slog.F("preset_name", ps.Preset.Name))
		return nil
	}

	// nolint:gocritic // ReconcilePreset needs Prebuilds Orchestrator permissions.
	prebuildsCtx := dbauthz.AsPrebuildsOrchestrator(ctx)

	// Pick the log level BEFORE emitting the "calculated" line below, so that the
	// line is actually logged at the chosen level rather than always at debug.
	levelFn := logger.Debug
	switch {
	case action.ActionType == prebuilds.ActionTypeBackoff:
		levelFn = logger.Warn
	// Log at info level when there's a change to be effected.
	case action.ActionType == prebuilds.ActionTypeCreate && action.Create > 0:
		levelFn = logger.Info
	case action.ActionType == prebuilds.ActionTypeDelete && len(action.DeleteIDs) > 0:
		levelFn = logger.Info
	case action.ActionType == prebuilds.ActionTypeCancelPending:
		levelFn = logger.Info
	}

	fields := []slog.Field{
		slog.F("action_type", action.ActionType), slog.F("create_count", action.Create),
		slog.F("delete_count", len(action.DeleteIDs)), slog.F("to_delete", action.DeleteIDs),
	}
	levelFn(ctx, "calculated reconciliation action for preset", fields...)

	switch action.ActionType {
	case prebuilds.ActionTypeBackoff:
		// If there is anything to backoff for (usually a cycle of failed prebuilds), then log and bail out.
		levelFn(ctx, "template prebuild state retrieved, backing off",
			append(fields,
				slog.F("backoff_until", action.BackoffUntil.Format(time.RFC3339)),
				slog.F("backoff_secs", math.Round(action.BackoffUntil.Sub(c.clock.Now()).Seconds())),
			)...)

		return nil

	case prebuilds.ActionTypeCreate:
		// Unexpected things happen (i.e. bugs or bitflips); let's defend against disastrous outcomes.
		// See https://blog.robertelder.org/causes-of-bit-flips-in-computer-memory/.
		// This is obviously not comprehensive protection against this sort of problem, but this is one essential check.
		desired := ps.CalculateDesiredInstances(c.clock.Now())

		if action.Create > desired {
			logger.Critical(ctx, "determined excessive count of prebuilds to create; clamping to desired count",
				slog.F("create_count", action.Create), slog.F("desired_count", desired))

			action.Create = desired
		}

		// If preset is hard-limited, and it's a create operation, log it and exit early.
		// Creation operation is disallowed for hard-limited preset.
		if ps.IsHardLimited && action.Create > 0 {
			logger.Warn(ctx, "skipping hard limited preset for create operation")
			return nil
		}

		// Attempt every create; one failure must not prevent the others.
		var multiErr multierror.Error
		for range action.Create {
			if err := c.createPrebuiltWorkspace(prebuildsCtx, uuid.New(), ps.Preset.TemplateID, ps.Preset.ID); err != nil {
				logger.Error(ctx, "failed to create prebuild", slog.Error(err))
				multiErr.Errors = append(multiErr.Errors, err)
			}
		}

		return multiErr.ErrorOrNil()

	case prebuilds.ActionTypeDelete:
		// Attempt every delete; one failure must not prevent the others.
		var multiErr multierror.Error
		for _, id := range action.DeleteIDs {
			if err := c.deletePrebuiltWorkspace(prebuildsCtx, id, ps.Preset.TemplateID, ps.Preset.ID); err != nil {
				logger.Error(ctx, "failed to delete prebuild", slog.Error(err))
				multiErr.Errors = append(multiErr.Errors, err)
			}
		}

		return multiErr.ErrorOrNil()

	case prebuilds.ActionTypeCancelPending:
		return c.cancelAndOrphanDeletePendingPrebuilds(ctx, ps.Preset.TemplateID, ps.Preset.TemplateVersionID, ps.Preset.ID)

	default:
		return xerrors.Errorf("unknown action type: %v", action.ActionType)
	}
}

// createPrebuiltWorkspace creates a new prebuilt workspace with the given ID for
// the given template and preset, and queues its start build.
//
// Inside a single repeatable-read write transaction on c.store it inserts a
// workspace owned by the prebuilds system user under a generated name and
// provisions a start transition for it. After the transaction has succeeded,
// the resulting provisioner job is handed to publishProvisionerJob. Any error
// from name generation or from the transaction is returned and nothing is
// published.
func (c *StoreReconciler) createPrebuiltWorkspace(ctx context.Context, prebuiltWorkspaceID uuid.UUID, templateID uuid.UUID, presetID uuid.UUID) error {
	ctx, span := c.tracer.Start(ctx, "prebuilds.createPrebuiltWorkspace", trace.WithAttributes(
		attribute.String("prebuild_id", prebuiltWorkspaceID.String()),
		attribute.String("template_id", templateID.String()),
		attribute.String("preset_id", presetID.String()),
	))
	defer span.End()

	name, err := prebuilds.GenerateName()
	if err != nil {
		return xerrors.Errorf("failed to generate unique prebuild ID: %w", err)
	}

	var provisionerJob *database.ProvisionerJob
	err = c.store.InTx(func(db database.Store) error {
		template, err := db.GetTemplateByID(ctx, templateID)
		if err != nil {
			return xerrors.Errorf("failed to get template: %w", err)
		}

		// Read the clock once so that every timestamp on the new row agrees.
		now := c.clock.Now()

		minimumWorkspace, err := db.InsertWorkspace(ctx, database.InsertWorkspaceParams{
			ID:                prebuiltWorkspaceID,
			CreatedAt:         now,
			UpdatedAt:         now,
			OwnerID:           database.PrebuildsSystemUserID,
			OrganizationID:    template.OrganizationID,
			TemplateID:        template.ID,
			Name:              name,
			LastUsedAt:        now,
			AutomaticUpdates:  database.AutomaticUpdatesNever,
			AutostartSchedule: sql.NullString{},
			Ttl:               sql.NullInt64{},
			NextStartAt:       sql.NullTime{},
		})
		if err != nil {
			return xerrors.Errorf("insert workspace: %w", err)
		}

		// We have to refetch the workspace for the joined in fields.
		workspace, err := db.GetWorkspaceByID(ctx, minimumWorkspace.ID)
		if err != nil {
			return xerrors.Errorf("get workspace by ID: %w", err)
		}

		c.logger.Info(ctx, "attempting to create prebuild", slog.F("name", name),
			slog.F("workspace_id", prebuiltWorkspaceID.String()), slog.F("preset_id", presetID.String()))

		provisionerJob, err = c.provision(ctx, db, prebuiltWorkspaceID, template, presetID, database.WorkspaceTransitionStart, workspace, DeprovisionModeNormal)
		return err
	}, &database.TxOptions{
		Isolation:    sql.LevelRepeatableRead,
		ReadOnly:     false,
		TxIdentifier: "prebuilds.createPrebuiltWorkspace",
	})
	if err != nil {
		return err
	}

	// Publish provisioner job event to notify the acquirer that a new job was posted
	c.publishProvisionerJob(ctx, provisionerJob, prebuiltWorkspaceID)

	return nil
}

// provisionDelete queues a delete transition for the prebuilt workspace
// identified by workspaceID and returns the resulting provisioner job.
//
// The workspace must still be owned by the prebuilds system user; otherwise an
// error is returned and nothing is provisioned.
//
// With DeprovisionModeOrphan the builder will not send Terraform state to the
// provisioner. That lets the workspace be deleted even when no provisioners are
// available, and is safe when no Terraform resources were actually created
// (e.g., for pending prebuilds that were canceled before provisioning started).
//
// IMPORTANT: This function must be called within a database transaction. It does not create its own transaction.
// The caller is responsible for managing the transaction boundary via db.InTx().
func (c *StoreReconciler) provisionDelete(ctx context.Context, db database.Store, workspaceID uuid.UUID, templateID uuid.UUID, presetID uuid.UUID, mode DeprovisionMode) (*database.ProvisionerJob, error) {
	ws, err := db.GetWorkspaceByID(ctx, workspaceID)
	if err != nil {
		return nil, xerrors.Errorf("get workspace by ID: %w", err)
	}

	tpl, err := db.GetTemplateByID(ctx, templateID)
	if err != nil {
		return nil, xerrors.Errorf("failed to get template: %w", err)
	}

	// A different owner means the prebuild no longer belongs to the prebuilds user.
	if owner := ws.OwnerID; owner != database.PrebuildsSystemUserID {
		return nil, xerrors.Errorf("prebuilt workspace is not owned by prebuild user anymore, probably it was claimed")
	}

	c.logger.Info(ctx, "attempting to delete prebuild",
		slog.F("orphan", mode.String()),
		slog.F("name", ws.Name),
		slog.F("workspace_id", workspaceID.String()),
		slog.F("preset_id", presetID.String()),
	)

	return c.provision(ctx, db, workspaceID, tpl, presetID, database.WorkspaceTransitionDelete, ws, mode)
}

// cancelAndOrphanDeletePendingPrebuilds deals with prebuilds of a preset that belongs to an
// inactive template version: their still-pending provisioner jobs are canceled and the
// workspaces behind those jobs are orphan-deleted.
//
// Cancellation is a criteria-based update, so only jobs that are still pending when the
// statement executes are canceled; this avoids racing with jobs that have meanwhile
// transitioned to running.
//
// Because no provisioner ever processed the canceled jobs, no Terraform resources were
// created, which makes it safe to orphan-delete the workspaces (Terraform destroy is skipped).
//
// Everything runs in a single transaction: if any orphan delete fails, the collected errors
// are returned from the transaction function and nothing is published.
func (c *StoreReconciler) cancelAndOrphanDeletePendingPrebuilds(ctx context.Context, templateID uuid.UUID, templateVersionID uuid.UUID, presetID uuid.UUID) error {
	ctx, span := c.tracer.Start(ctx, "prebuilds.cancelAndOrphanDeletePendingPrebuilds", trace.WithAttributes(
		attribute.String("template_id", templateID.String()),
		attribute.String("template_version_id", templateVersionID.String()),
		attribute.String("preset_id", presetID.String()),
	))
	defer span.End()

	// The first delete job that was scheduled successfully, plus the workspace it belongs to.
	var (
		firstJob         *database.ProvisionerJob
		firstWorkspaceID uuid.UUID
	)

	cancelThenDelete := func(tx database.Store) error {
		canceled, err := tx.UpdatePrebuildProvisionerJobWithCancel(ctx, database.UpdatePrebuildProvisionerJobWithCancelParams{
			Now:      c.clock.Now(),
			PresetID: uuid.NullUUID{UUID: presetID, Valid: true},
		})
		if err != nil {
			c.logger.Error(ctx, "failed to cancel pending prebuild jobs",
				slog.F("template_id", templateID.String()),
				slog.F("template_version_id", templateVersionID.String()),
				slog.F("preset_id", presetID.String()),
				slog.Error(err))
			return err
		}

		if len(canceled) > 0 {
			c.logger.Info(ctx, "canceled pending prebuild jobs for inactive version",
				slog.F("template_id", templateID.String()),
				slog.F("template_version_id", templateVersionID.String()),
				slog.F("preset_id", presetID.String()),
				slog.F("count", len(canceled)))
		}

		// Keep going after a failure so every problem is logged and reported together.
		failures := &multierror.Error{}
		for _, job := range canceled {
			deleteJob, err := c.provisionDelete(ctx, tx, job.WorkspaceID, job.TemplateID, presetID, DeprovisionModeOrphan)
			if err != nil {
				c.logger.Error(ctx, "failed to orphan delete canceled prebuild",
					slog.F("workspace_id", job.WorkspaceID.String()), slog.Error(err))
				failures.Errors = append(failures.Errors, err)
				continue
			}
			if firstJob == nil {
				firstJob = deleteJob
				firstWorkspaceID = job.WorkspaceID
			}
		}

		return failures.ErrorOrNil()
	}

	txOpts := &database.TxOptions{
		Isolation:    sql.LevelRepeatableRead,
		ReadOnly:     false,
		TxIdentifier: "prebuilds.cancelAndOrphanDeletePendingPrebuilds",
	}
	if err := c.store.InTx(cancelThenDelete, txOpts); err != nil {
		return err
	}

	// Job event notifications contain organization, provisioner type, and tags. All canceled
	// jobs share those values, so a single notification for the first successfully scheduled
	// job is enough to trigger the provisioner chain that processes the remaining jobs.
	if firstJob != nil {
		c.publishProvisionerJob(ctx, firstJob, firstWorkspaceID)
	}

	return nil
}

// deletePrebuiltWorkspace schedules a normal (non-orphan) delete build for the given prebuilt
// workspace inside a repeatable-read transaction. Once the transaction has succeeded, the
// resulting provisioner job is published so the acquirer learns about it.
func (c *StoreReconciler) deletePrebuiltWorkspace(ctx context.Context, prebuiltWorkspaceID uuid.UUID, templateID uuid.UUID, presetID uuid.UUID) error {
	ctx, span := c.tracer.Start(ctx, "prebuilds.deletePrebuiltWorkspace", trace.WithAttributes(
		attribute.String("prebuild_id", prebuiltWorkspaceID.String()),
		attribute.String("template_id", templateID.String()),
		attribute.String("preset_id", presetID.String()),
	))
	defer span.End()

	txOpts := &database.TxOptions{
		Isolation:    sql.LevelRepeatableRead,
		ReadOnly:     false,
		TxIdentifier: "prebuilds.deletePrebuiltWorkspace",
	}

	var deleteJob *database.ProvisionerJob
	scheduleDelete := func(tx database.Store) error {
		job, err := c.provisionDelete(ctx, tx, prebuiltWorkspaceID, templateID, presetID, DeprovisionModeNormal)
		deleteJob = job
		return err
	}
	if err := c.store.InTx(scheduleDelete, txOpts); err != nil {
		return err
	}

	// Notify the acquirer that a new job was posted; this happens only after InTx returned
	// without error.
	c.publishProvisionerJob(ctx, deleteJob, prebuiltWorkspaceID)

	return nil
}

// provision creates a workspace build for a prebuild with the given transition and returns
// the provisioner job backing that build.
//
// For non-delete transitions the build is pinned to the template's active version and carries
// the preset ID plus the preset's parameter values. For delete transitions none of these are
// set; when mode is DeprovisionModeOrphan the delete is additionally marked as orphan.
//
// The build is executed against db as passed in by the caller. An error is returned when the
// preset parameters cannot be fetched, when the build fails, or when the build reports no
// provisioner job.
func (c *StoreReconciler) provision(
	ctx context.Context,
	db database.Store,
	prebuildID uuid.UUID,
	template database.Template,
	presetID uuid.UUID,
	transition database.WorkspaceTransition,
	workspace database.Workspace,
	mode DeprovisionMode,
) (*database.ProvisionerJob, error) {
	spanAttrs := trace.WithAttributes(
		attribute.String("prebuild_id", prebuildID.String()),
		attribute.String("template_id", template.ID.String()),
		attribute.String("preset_id", presetID.String()),
		attribute.String("transition", string(transition)),
		attribute.String("workspace_id", workspace.ID.String()),
		attribute.String("mode", mode.String()),
	)
	ctx, span := c.tracer.Start(ctx, "prebuilds.provision", spanAttrs)
	defer span.End()

	versionPresetParams, err := db.GetPresetParametersByTemplateVersionID(ctx, template.ActiveVersionID)
	if err != nil {
		return nil, xerrors.Errorf("fetch preset details: %w", err)
	}

	// The query returns parameters for every preset of the version; keep only this preset's.
	// TODO: don't fetch in the first place.
	var buildParams []codersdk.WorkspaceBuildParameter
	for _, p := range versionPresetParams {
		if p.TemplateVersionPresetID == presetID {
			buildParams = append(buildParams, codersdk.WorkspaceBuildParameter{
				Name:  p.Name,
				Value: p.Value,
			})
		}
	}

	builder := wsbuilder.New(workspace, transition, *c.buildUsageChecker.Load()).
		Reason(database.BuildReasonInitiator).
		Initiator(database.PrebuildsSystemUserID).
		MarkPrebuild()

	switch {
	case transition != database.WorkspaceTransitionDelete:
		// The version is deliberately left unset for deletes: the prebuilt workspace may have
		// been created from an older template version, and without an explicit version the
		// builder falls back to the version of the last workspace build, which is what we want.
		//
		// Likewise, the required params are only injected when the prebuild is being created.
		// This mirrors the behavior of regular workspace deletion (see cli/delete.go).
		builder = builder.
			VersionID(template.ActiveVersionID).
			TemplateVersionPresetID(presetID).
			RichParameterValues(buildParams)
	case mode == DeprovisionModeOrphan:
		// Delete transition without any Terraform resources to destroy: use orphan mode.
		builder = builder.Orphan()
	}

	// provisionerd is a separate service and should start its own trace rather than continue
	// the prebuilds trace, so the span is replaced with a no-op before building.
	detachedCtx := trace.ContextWithSpan(ctx, tracing.NoopSpan)

	allowAll := func(_ policy.Action, _ rbac.Objecter) bool {
		return true // TODO: harden?
	}
	_, job, _, err := builder.Build(detachedCtx, db, c.fileCache, allowAll, audit.WorkspaceBuildBaggage{})
	if err != nil {
		return nil, xerrors.Errorf("provision workspace: %w", err)
	}
	if job == nil {
		// builder.Build() is expected to yield either a job or an error; fail fast if it
		// unexpectedly yields neither.
		return nil, xerrors.Errorf("provision succeeded but returned no job")
	}

	c.logger.Info(ctx, "prebuild job scheduled",
		slog.F("transition", transition),
		slog.F("prebuild_id", prebuildID.String()),
		slog.F("preset_id", presetID.String()),
		slog.F("job_id", job.ID),
	)

	return job, nil
}

// publishProvisionerJob hands a newly created provisioner job to the notification channel so
// the acquirer learns that a job was posted. A nil job is ignored.
//
// Call this only after the database transaction that created the job has committed; otherwise
// provisioners may query the database before the job is visible.
//
// The send never blocks: when the channel cannot accept the job, the notification is dropped
// with a warning.
func (c *StoreReconciler) publishProvisionerJob(ctx context.Context, provisionerJob *database.ProvisionerJob, workspaceID uuid.UUID) {
	if provisionerJob == nil {
		return
	}

	select {
	case c.provisionNotifyCh <- *provisionerJob:
		return
	default:
	}

	// Channel full, drop the message; provisioner will pick this job up later with its
	// periodic check.
	c.logger.Warn(ctx, "provisioner job notification queue full, dropping",
		slog.F("job_id", provisionerJob.ID),
		slog.F("prebuild_id", workspaceID.String()),
	)
}

// ForceMetricsUpdate asks the metrics collector to refresh its state right away. The metrics
// state is normally cached to reduce load on the database. When no collector is configured
// this is a no-op that returns nil.
func (c *StoreReconciler) ForceMetricsUpdate(ctx context.Context) error {
	if collector := c.metrics; collector != nil {
		return collector.UpdateState(ctx, 10*time.Second)
	}

	return nil
}

// TrackResourceReplacement processes the resource replacements reported for the given
// workspace build by delegating to trackResourceReplacement under a system-restricted context.
// The work is bounded to one minute, and any failure is logged rather than returned.
func (c *StoreReconciler) TrackResourceReplacement(ctx context.Context, workspaceID, buildID uuid.UUID, replacements []*sdkproto.ResourceReplacement) {
	// nolint:gocritic // Necessary to query all the required data.
	ctx = dbauthz.AsSystemRestricted(ctx)

	// Callers may invoke this in a fire-and-forget fashion, so make sure we give up eventually.
	boundedCtx, cancel := context.WithTimeout(ctx, time.Minute)
	defer cancel()

	err := c.trackResourceReplacement(boundedCtx, workspaceID, buildID, replacements)
	if err == nil {
		return
	}
	c.logger.Error(ctx, "failed to track resource replacement", slog.Error(err))
}

// trackResourceReplacement records that resources were replaced in the given build of a
// workspace that started out as a prebuild, and notifies template admins about it.
//
// It loads the workspace, the given build, the workspace's first build (the prebuild) and its
// preset, the current workspace owner (the claimant), the build's template version and the
// organization. If a metrics collector is configured, the replacement is counted there. If a
// notification enqueuer is configured, a TemplateWorkspaceResourceReplaced notification is
// enqueued for every user holding the template admin role.
//
// The first failing lookup aborts the function with a wrapped error. Failures to enqueue
// individual notifications do not stop the loop; they are all collected and returned joined
// together once every admin has been attempted.
//
// nolint:revive // Shut up it's fine.
func (c *StoreReconciler) trackResourceReplacement(ctx context.Context, workspaceID, buildID uuid.UUID, replacements []*sdkproto.ResourceReplacement) error {
	// Bail out before touching the database if the context is already done.
	if err := ctx.Err(); err != nil {
		return err
	}

	workspace, err := c.store.GetWorkspaceByID(ctx, workspaceID)
	if err != nil {
		return xerrors.Errorf("fetch workspace %q: %w", workspaceID.String(), err)
	}

	build, err := c.store.GetWorkspaceBuildByID(ctx, buildID)
	if err != nil {
		return xerrors.Errorf("fetch workspace build %q: %w", buildID.String(), err)
	}

	// The first build will always be the prebuild.
	prebuild, err := c.store.GetWorkspaceBuildByWorkspaceIDAndBuildNumber(ctx, database.GetWorkspaceBuildByWorkspaceIDAndBuildNumberParams{
		WorkspaceID: workspaceID, BuildNumber: 1,
	})
	if err != nil {
		return xerrors.Errorf("fetch prebuild: %w", err)
	}

	// This should not be possible, but defend against it.
	if !prebuild.TemplateVersionPresetID.Valid || prebuild.TemplateVersionPresetID.UUID == uuid.Nil {
		return xerrors.Errorf("no preset used in prebuild for workspace %q", workspaceID.String())
	}

	prebuildPreset, err := c.store.GetPresetByID(ctx, prebuild.TemplateVersionPresetID.UUID)
	if err != nil {
		return xerrors.Errorf("fetch template preset for template version ID %q: %w", prebuild.TemplateVersionID.String(), err)
	}

	claimant, err := c.store.GetUserByID(ctx, workspace.OwnerID) // At this point, the workspace is owned by the new owner.
	if err != nil {
		return xerrors.Errorf("fetch claimant %q: %w", workspace.OwnerID.String(), err)
	}

	// Use the claiming build here (not prebuild) because both should be equivalent, and we might as well spot inconsistencies now.
	templateVersion, err := c.store.GetTemplateVersionByID(ctx, build.TemplateVersionID)
	if err != nil {
		return xerrors.Errorf("fetch template version %q: %w", build.TemplateVersionID.String(), err)
	}

	org, err := c.store.GetOrganizationByID(ctx, workspace.OrganizationID)
	if err != nil {
		return xerrors.Errorf("fetch org %q: %w", workspace.OrganizationID.String(), err)
	}

	// Track resource replacement in Prometheus metric.
	if c.metrics != nil {
		c.metrics.trackResourceReplacement(org.Name, workspace.TemplateName, prebuildPreset.Name)
	}

	// Send notification to template admins.
	if c.notifEnq == nil {
		c.logger.Warn(ctx, "notification enqueuer not set, cannot send resource replacement notification(s)")
		return nil
	}

	// Map each replaced resource to a comma-separated list of its paths.
	repls := make(map[string]string, len(replacements))
	for _, repl := range replacements {
		repls[repl.GetResource()] = strings.Join(repl.GetPaths(), ", ")
	}

	templateAdmins, err := c.store.GetUsers(ctx, database.GetUsersParams{
		RbacRole: []string{codersdk.RoleTemplateAdmin},
	})
	if err != nil {
		return xerrors.Errorf("fetch template admins: %w", err)
	}

	var notifErr error
	for _, templateAdmin := range templateAdmins {
		if _, err := c.notifEnq.EnqueueWithData(ctx, templateAdmin.ID, notifications.TemplateWorkspaceResourceReplaced,
			map[string]string{
				"org":                 org.Name,
				"workspace":           workspace.Name,
				"template":            workspace.TemplateName,
				"template_version":    templateVersion.Name,
				"preset":              prebuildPreset.Name,
				"workspace_build_num": fmt.Sprintf("%d", build.BuildNumber),
				"claimant":            claimant.Username,
			},
			map[string]any{
				"replacements": repls,
			}, "prebuilds_reconciler",
			// Associate this notification with all the related entities.
			workspace.ID, workspace.OwnerID, workspace.TemplateID, templateVersion.ID, prebuildPreset.ID, workspace.OrganizationID,
		); err != nil {
			// Accumulate rather than overwrite, so a failure for one admin is not hidden by a
			// later failure for another. errors.Join ignores the initial nil value.
			notifErr = errors.Join(notifErr, xerrors.Errorf("send notification to %q: %w", templateAdmin.ID.String(), err))
		}
	}

	return notifErr
}

// Settings is the JSON document that SetPrebuildsReconciliationPaused marshals and stores
// through UpsertPrebuildsSettings.
type Settings struct {
	// ReconciliationPaused is serialized under the "reconciliation_paused" key.
	ReconciliationPaused bool `json:"reconciliation_paused"`
}

// SetPrebuildsReconciliationPaused stores the prebuilds settings with the reconciliation-paused
// flag set to paused. The settings are JSON-encoded and written with db.UpsertPrebuildsSettings,
// whose error is returned as-is; a marshaling failure is returned wrapped.
func SetPrebuildsReconciliationPaused(ctx context.Context, db database.Store, paused bool) error {
	encoded, err := json.Marshal(Settings{ReconciliationPaused: paused})
	if err != nil {
		return xerrors.Errorf("marshal settings: %w", err)
	}

	return db.UpsertPrebuildsSettings(ctx, string(encoded))
}
