From c5d0d16600814d9577a92ba6f94fecabea58ae51 Mon Sep 17 00:00:00 2001
From: Cian Johnston
Date: Mon, 25 Sep 2023 14:00:33 +0100
Subject: [PATCH 1/6] update infra

---
 scaletest/terraform/infra/gcp_cluster.tf | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/scaletest/terraform/infra/gcp_cluster.tf b/scaletest/terraform/infra/gcp_cluster.tf
index 577894790d7ff..7c43036ad061f 100644
--- a/scaletest/terraform/infra/gcp_cluster.tf
+++ b/scaletest/terraform/infra/gcp_cluster.tf
@@ -48,7 +48,10 @@ resource "google_container_node_pool" "coder" {
   location   = var.zone
   project    = var.project_id
   cluster    = google_container_cluster.primary.name
-  node_count = var.state == "stopped" ? 0 : var.nodepool_size_coder
+  autoscaling {
+    min_node_count = 1
+    max_node_count = var.nodepool_size_coder
+  }
   management {
     auto_upgrade = false
   }
@@ -81,7 +84,10 @@ resource "google_container_node_pool" "workspaces" {
   location   = var.zone
   project    = var.project_id
   cluster    = google_container_cluster.primary.name
-  node_count = var.state == "stopped" ? 0 : var.nodepool_size_workspaces
+  autoscaling {
+    min_node_count = 0
+    max_node_count = var.nodepool_size_workspaces
+  }
   management {
     auto_upgrade = false
   }

From e0abf25a4ac7c56a6cf746fdb39f62e75c9964e4 Mon Sep 17 00:00:00 2001
From: Cian Johnston
Date: Mon, 25 Sep 2023 16:58:17 +0100
Subject: [PATCH 2/6] remove usage of null_resource

---
 scaletest/terraform/infra/gcp_cluster.tf |  16 +-
 scaletest/terraform/k8s/coder.tf         | 325 ++++++++++-------------
 scaletest/terraform/k8s/prometheus.tf    |  81 ++----
 scaletest/terraform/k8s/vars.tf          |   5 +
 4 files changed, 179 insertions(+), 248 deletions(-)

diff --git a/scaletest/terraform/infra/gcp_cluster.tf b/scaletest/terraform/infra/gcp_cluster.tf
index 7c43036ad061f..c96dddc6ae28a 100644
--- a/scaletest/terraform/infra/gcp_cluster.tf
+++ b/scaletest/terraform/infra/gcp_cluster.tf
@@ -44,10 +44,10 @@ resource "google_container_cluster" "primary" {
 }
 
 resource "google_container_node_pool" "coder" {
-  name       = "${var.name}-coder"
-  location   = var.zone
-  project    = var.project_id
-  cluster    = google_container_cluster.primary.name
+  name     = "${var.name}-coder"
+  location = var.zone
+  project  = var.project_id
+  cluster  = google_container_cluster.primary.name
   autoscaling {
     min_node_count = 1
     max_node_count = var.nodepool_size_coder
@@ -80,10 +80,10 @@ resource "google_container_node_pool" "coder" {
 }
 
 resource "google_container_node_pool" "workspaces" {
-  name       = "${var.name}-workspaces"
-  location   = var.zone
-  project    = var.project_id
-  cluster    = google_container_cluster.primary.name
+  name     = "${var.name}-workspaces"
+  location = var.zone
+  project  = var.project_id
+  cluster  = google_container_cluster.primary.name
   autoscaling {
     min_node_count = 0
     max_node_count = var.nodepool_size_workspaces
diff --git a/scaletest/terraform/k8s/coder.tf b/scaletest/terraform/k8s/coder.tf
index f6b9ae7d16a09..c0522df8a127a 100644
--- a/scaletest/terraform/k8s/coder.tf
+++ b/scaletest/terraform/k8s/coder.tf
@@ -1,53 +1,164 @@
 data "google_client_config" "default" {}
 
 locals {
-  coder_helm_repo    = "https://helm.coder.com/v2"
-  coder_helm_chart   = "coder"
-  coder_release_name = var.name
-  coder_namespace    = "coder-${var.name}"
-  coder_admin_email  = "admin@coder.com"
-  coder_admin_user   = "coder"
-  coder_access_url   = "http://${var.coder_address}"
+  coder_access_url          = "http://${var.coder_address}"
+  coder_admin_email         = "admin@coder.com"
+  coder_admin_user          = "coder"
+  coder_helm_repo           = "https://helm.coder.com/v2"
+  coder_helm_chart          = "coder"
+  coder_namespace           = "coder-${var.name}"
+  coder_release_name        = var.name
+  provisionerd_helm_chart   = "coder-provisioner"
+  provisionerd_release_name = "${var.name}-provisionerd"
 }
 
-resource "null_resource" "coder_namespace" {
-  triggers = {
-    namespace       = local.coder_namespace
-    kubeconfig_path = var.kubernetes_kubeconfig_path
-  }
-  provisioner "local-exec" {
-    when    = create
-    command = <<EOF

From: Cian Johnston
Date: Mon, 25 Sep 2023 17:01:18 +0100
Subject: [PATCH 3/6] fixup! remove usage of null_resource

---
 scaletest/terraform/k8s/coder.tf | 20 +++-----------------
 1 file changed, 3 insertions(+), 17 deletions(-)

diff --git a/scaletest/terraform/k8s/coder.tf b/scaletest/terraform/k8s/coder.tf
index c0522df8a127a..ffe125dfd4788 100644
--- a/scaletest/terraform/k8s/coder.tf
+++ b/scaletest/terraform/k8s/coder.tf
@@ -180,7 +180,7 @@ coder:
               operator: "In"
               values: ["${local.coder_release_name}"]
   env:
-    - name: "CODER_ACCESS_URL"
+    - name: "CODER_URL"
       value: "${local.coder_access_url}"
     - name: "CODER_CACHE_DIRECTORY"
       value: "/tmp/coder"
@@ -190,28 +190,14 @@ coder:
       value: "/dev/null"
     - name: "CODER_LOGGING_STACKDRIVER"
       value: "/dev/stderr"
-    - name: "CODER_PG_CONNECTION_URL"
-      valueFrom:
-        secretKeyRef:
-          name: "${kubernetes_secret.coder-db.metadata.0.name}"
-          key: url
     - name: "CODER_PPROF_ENABLE"
       value: "true"
     - name: "CODER_PROMETHEUS_ENABLE"
       value: "true"
-    - name: "CODER_PROMETHEUS_COLLECT_AGENT_STATS"
-      value: "true"
-    - name: "CODER_PROMETHEUS_COLLECT_DB_METRICS"
-      value: "true"
     - name: "CODER_VERBOSE"
       value: "true"
-    - name: "CODER_EXPERIMENTS"
-      value: "${var.coder_experiments}"
-    - name: "CODER_DANGEROUS_DISABLE_RATE_LIMITS"
-      value: "true"
-    # Disabling built-in provisioner daemons
-    - name: "CODER_PROVISIONER_DAEMONS"
-      value: "0"
+    - name: "CODER_PROVISIONERD_TAGS"
+      value: "scope=organization"
   image:
     repo: ${var.coder_image_repo}
     tag: ${var.coder_image_tag}

From 95660480d1bdd7e7ec1f6a0c09a3852b309d7bd8 Mon Sep 17 00:00:00 2001
From: Cian Johnston
Date: Mon, 25 Sep 2023 17:51:40 +0100
Subject: [PATCH 4/6] add OIDC and OTEL env vars

---
 scaletest/terraform/k8s/coder.tf | 40 ++++++++++++++++++++++++++++++--
 1 file changed, 38 insertions(+), 2 deletions(-)

diff --git a/scaletest/terraform/k8s/coder.tf b/scaletest/terraform/k8s/coder.tf
index ffe125dfd4788..ee35e18342791 100644
--- a/scaletest/terraform/k8s/coder.tf
+++ b/scaletest/terraform/k8s/coder.tf
@@ -60,12 +60,20 @@ data "kubernetes_secret" "coder_tls" {
   }
 }
 
+# Also need an OTEL collector deployed. Manual for now.
+data "kubernetes_service" "otel_collector" { + metadata { + namespace = kubernetes_namespace.coder_namespace.metadata.0.name + name = "otel-collector" + } +} + resource "helm_release" "coder-chart" { repository = local.coder_helm_repo chart = local.coder_helm_chart name = local.coder_release_name version = var.coder_chart_version - namespace = kubernetes_namespace.coder_namespace + namespace = kubernetes_namespace.coder_namespace.metadata.0.name values = [< Date: Tue, 26 Sep 2023 15:50:37 +0100 Subject: [PATCH 5/6] make infra plan apply cleanly after import --- scaletest/terraform/infra/gcp_cluster.tf | 25 +++++++++++++++++++----- scaletest/terraform/infra/gcp_db.tf | 13 ++++++++++++ scaletest/terraform/infra/gcp_vpc.tf | 2 +- scaletest/terraform/infra/vars.tf | 5 +++++ 4 files changed, 39 insertions(+), 6 deletions(-) diff --git a/scaletest/terraform/infra/gcp_cluster.tf b/scaletest/terraform/infra/gcp_cluster.tf index c96dddc6ae28a..c37132c38071b 100644 --- a/scaletest/terraform/infra/gcp_cluster.tf +++ b/scaletest/terraform/infra/gcp_cluster.tf @@ -41,6 +41,15 @@ resource "google_container_cluster" "primary" { workload_identity_config { workload_pool = "${data.google_project.project.project_id}.svc.id.goog" } + + + lifecycle { + ignore_changes = [ + maintenance_policy, + release_channel, + remove_default_node_pool + ] + } } resource "google_container_node_pool" "coder" { @@ -52,9 +61,6 @@ resource "google_container_node_pool" "coder" { min_node_count = 1 max_node_count = var.nodepool_size_coder } - management { - auto_upgrade = false - } node_config { oauth_scopes = [ "https://www.googleapis.com/auth/logging.write", @@ -77,6 +83,9 @@ resource "google_container_node_pool" "coder" { disable-legacy-endpoints = "true" } } + lifecycle { + ignore_changes = [management[0].auto_repair, management[0].auto_upgrade, timeouts] + } } resource "google_container_node_pool" "workspaces" { @@ -85,8 +94,8 @@ resource "google_container_node_pool" "workspaces" { project = var.project_id cluster = google_container_cluster.primary.name autoscaling { - min_node_count = 0 - max_node_count = var.nodepool_size_workspaces + min_node_count = 0 + total_max_node_count = var.nodepool_size_workspaces } management { auto_upgrade = false @@ -113,6 +122,9 @@ resource "google_container_node_pool" "workspaces" { disable-legacy-endpoints = "true" } } + lifecycle { + ignore_changes = [management[0].auto_repair, management[0].auto_upgrade, timeouts] + } } resource "google_container_node_pool" "misc" { @@ -146,6 +158,9 @@ resource "google_container_node_pool" "misc" { disable-legacy-endpoints = "true" } } + lifecycle { + ignore_changes = [management[0].auto_repair, management[0].auto_upgrade, timeouts] + } } resource "null_resource" "cluster_kubeconfig" { diff --git a/scaletest/terraform/infra/gcp_db.tf b/scaletest/terraform/infra/gcp_db.tf index 1a02324ce071f..4d13b262c615f 100644 --- a/scaletest/terraform/infra/gcp_db.tf +++ b/scaletest/terraform/infra/gcp_db.tf @@ -32,6 +32,10 @@ resource "google_sql_database_instance" "db" { record_client_address = false } } + + lifecycle { + ignore_changes = [deletion_protection, timeouts] + } } resource "google_sql_database" "coder" { @@ -40,6 +44,9 @@ resource "google_sql_database" "coder" { name = "${var.name}-coder" # required for postgres, otherwise db fails to delete deletion_policy = "ABANDON" + lifecycle { + ignore_changes = [deletion_policy] + } } resource "random_password" "coder-postgres-password" { @@ -58,6 +65,9 @@ resource "google_sql_user" "coder" { password = 
   # required for postgres, otherwise user fails to delete
   deletion_policy = "ABANDON"
+  lifecycle {
+    ignore_changes = [deletion_policy, password]
+  }
 }
 
 resource "google_sql_user" "prometheus" {
@@ -68,6 +78,9 @@ resource "google_sql_user" "prometheus" {
   password = random_password.prometheus-postgres-password.result
   # required for postgres, otherwise user fails to delete
   deletion_policy = "ABANDON"
+  lifecycle {
+    ignore_changes = [deletion_policy, password]
+  }
 }
 
 locals {
diff --git a/scaletest/terraform/infra/gcp_vpc.tf b/scaletest/terraform/infra/gcp_vpc.tf
index eb965354c3917..b125c60cfd25a 100644
--- a/scaletest/terraform/infra/gcp_vpc.tf
+++ b/scaletest/terraform/infra/gcp_vpc.tf
@@ -12,7 +12,7 @@ resource "google_compute_subnetwork" "subnet" {
   project       = var.project_id
   region        = var.region
   network       = google_compute_network.vpc.name
-  ip_cidr_range = "10.200.0.0/24"
+  ip_cidr_range = var.subnet_cidr
 }
 
 resource "google_compute_global_address" "sql_peering" {
diff --git a/scaletest/terraform/infra/vars.tf b/scaletest/terraform/infra/vars.tf
index e26e5fa54f7df..d9f5040918ba5 100644
--- a/scaletest/terraform/infra/vars.tf
+++ b/scaletest/terraform/infra/vars.tf
@@ -25,6 +25,11 @@ variable "zone" {
   default     = "us-east1-c"
 }
 
+variable "subnet_cidr" {
+  description = "CIDR range for the subnet."
+  default     = "10.200.0.0/24"
+}
+
 variable "k8s_version" {
   description = "Kubernetes version to provision."
   default     = "1.24"

From add8f50530b50a17a77da7da26ac9ddf2283bfbe Mon Sep 17 00:00:00 2001
From: Cian Johnston
Date: Tue, 26 Sep 2023 17:00:07 +0100
Subject: [PATCH 6/6] make k8s terraform apply somewhat cleanly

---
 scaletest/terraform/k8s/coder.tf      | 49 ++++++++++++++-------------
 scaletest/terraform/k8s/prometheus.tf |  9 +++++
 scaletest/terraform/k8s/vars.tf       | 24 ++++++++-----
 3 files changed, 51 insertions(+), 31 deletions(-)

diff --git a/scaletest/terraform/k8s/coder.tf b/scaletest/terraform/k8s/coder.tf
index ee35e18342791..3c3670a8c20a9 100644
--- a/scaletest/terraform/k8s/coder.tf
+++ b/scaletest/terraform/k8s/coder.tf
@@ -1,7 +1,7 @@
 data "google_client_config" "default" {}
 
 locals {
-  coder_access_url          = "http://${var.coder_address}"
+  coder_url                 = var.coder_access_url == "" ? "http://${var.coder_address}" : var.coder_access_url
   coder_admin_email         = "admin@coder.com"
   coder_admin_user          = "coder"
   coder_helm_repo           = "https://helm.coder.com/v2"
@@ -16,6 +16,9 @@ resource "kubernetes_namespace" "coder_namespace" {
   metadata {
     name = local.coder_namespace
   }
+  lifecycle {
+    ignore_changes = [timeouts, wait_for_default_service_account]
+  }
 }
 
 resource "random_password" "provisionerd_psk" {
@@ -31,6 +34,9 @@ resource "kubernetes_secret" "coder-db" {
   data = {
     url = var.coder_db_url
   }
+  lifecycle {
+    ignore_changes = [timeouts, wait_for_service_account_token]
+  }
 }
 
 resource "kubernetes_secret" "provisionerd_psk" {
@@ -42,6 +48,9 @@ resource "kubernetes_secret" "provisionerd_psk" {
   data = {
     psk = random_password.provisionerd_psk.result
   }
+  lifecycle {
+    ignore_changes = [timeouts, wait_for_service_account_token]
+  }
 }
 
 # OIDC secret needs to be manually provisioned for now.
@@ -96,10 +105,10 @@ coder:
               values: ["${local.coder_release_name}"]
   env:
     - name: "CODER_ACCESS_URL"
-      value: "${local.coder_access_url}"
+      value: "${local.coder_url}"
     - name: "CODER_CACHE_DIRECTORY"
       value: "/tmp/coder"
-    - name: "CODER_ENABLE_TELEMETRY"
+    - name: "CODER_TELEMETRY_ENABLE"
       value: "false"
     - name: "CODER_LOGGING_HUMAN"
       value: "/dev/null"
@@ -189,7 +198,7 @@ EOF
   ]
 }
 
-resource "helm_release" "provisionerd_chart" {
+resource "helm_release" "provisionerd-chart" {
   repository = local.coder_helm_repo
   chart      = local.provisionerd_helm_chart
   name       = local.provisionerd_release_name
@@ -217,40 +226,34 @@ coder:
               values: ["${local.coder_release_name}"]
   env:
     - name: "CODER_URL"
-      value: "${local.coder_access_url}"
+      value: "${local.coder_url}"
+    - name: "CODER_VERBOSE"
+      value: "true"
     - name: "CODER_CACHE_DIRECTORY"
       value: "/tmp/coder"
-    - name: "CODER_ENABLE_TELEMETRY"
+    - name: "CODER_TELEMETRY_ENABLE"
       value: "false"
     - name: "CODER_LOGGING_HUMAN"
       value: "/dev/null"
     - name: "CODER_LOGGING_STACKDRIVER"
       value: "/dev/stderr"
-    - name: "CODER_PPROF_ENABLE"
-      value: "true"
     - name: "CODER_PROMETHEUS_ENABLE"
       value: "true"
-    - name: "CODER_VERBOSE"
-      value: "true"
     - name: "CODER_PROVISIONERD_TAGS"
       value: "scope=organization"
   image:
-    repo: ${var.coder_image_repo}
-    tag: ${var.coder_image_tag}
-  replicaCount: "${var.coder_replicas}"
+    repo: ${var.provisionerd_image_repo}
+    tag: ${var.provisionerd_image_tag}
+  replicaCount: "${var.provisionerd_replicas}"
   resources:
     requests:
-      cpu: "${var.coder_cpu_request}"
-      memory: "${var.coder_mem_request}"
+      cpu: "${var.provisionerd_cpu_request}"
+      memory: "${var.provisionerd_mem_request}"
     limits:
-      cpu: "${var.coder_cpu_limit}"
-      memory: "${var.coder_mem_limit}"
+      cpu: "${var.provisionerd_cpu_limit}"
+      memory: "${var.provisionerd_mem_limit}"
   securityContext:
     readOnlyRootFilesystem: true
-  service:
-    enable: true
-    sessionAffinity: None
-    loadBalancerIP: "${var.coder_address}"
   volumeMounts:
   - mountPath: "/tmp"
     name: cache
@@ -353,10 +356,10 @@ resource "local_file" "kubernetes_template" {
 
 resource "local_file" "output_vars" {
   filename = "${path.module}/../../.coderv2/url"
-  content  = local.coder_access_url
+  content  = local.coder_url
 }
 
 output "coder_url" {
   description = "URL of the Coder deployment"
-  value       = local.coder_access_url
+  value       = local.coder_url
 }
diff --git a/scaletest/terraform/k8s/prometheus.tf b/scaletest/terraform/k8s/prometheus.tf
index c6346fdfcf5c8..accf926727575 100644
--- a/scaletest/terraform/k8s/prometheus.tf
+++ b/scaletest/terraform/k8s/prometheus.tf
@@ -14,6 +14,9 @@ resource "kubernetes_namespace" "prometheus_namespace" {
   metadata {
     name = local.prometheus_namespace
   }
+  lifecycle {
+    ignore_changes = [timeouts, wait_for_default_service_account]
+  }
 }
 
 # Create a secret to store the remote write key
@@ -29,6 +32,9 @@ resource "kubernetes_secret" "prometheus-credentials" {
     username = var.prometheus_remote_write_user
     password = var.prometheus_remote_write_password
   }
+  lifecycle {
+    ignore_changes = [timeouts, wait_for_service_account_token]
+  }
 }
 
 # Install Prometheus using the Bitnami Prometheus helm chart.
@@ -105,6 +111,9 @@ resource "kubernetes_secret" "prometheus-postgres-password" {
     username = var.prometheus_postgres_user
     password = var.prometheus_postgres_password
   }
+  lifecycle {
+    ignore_changes = [timeouts, wait_for_service_account_token]
+  }
 }
 
 # Install Prometheus Postgres exporter helm chart
diff --git a/scaletest/terraform/k8s/vars.tf b/scaletest/terraform/k8s/vars.tf
index b80735261f412..36d67fbd3639b 100644
--- a/scaletest/terraform/k8s/vars.tf
+++ b/scaletest/terraform/k8s/vars.tf
@@ -28,6 +28,9 @@ variable "kubernetes_nodepool_misc" {
 }
 
 // These variables control the Coder deployment.
+variable "coder_access_url" {
+  description = "Access URL for the Coder deployment."
+}
 variable "coder_replicas" {
   description = "Number of Coder replicas to provision."
   default     = 1
@@ -68,12 +71,12 @@ variable "coder_mem_limit" {
 // Allow independently scaling provisionerd resources
 variable "provisionerd_cpu_request" {
   description = "CPU request to allocate to provisionerd."
-  default     = "500m"
+  default     = "100m"
 }
 
 variable "provisionerd_mem_request" {
   description = "Memory request to allocate to provisionerd."
-  default     = "512Mi"
+  default     = "1Gi"
 }
 
 variable "provisionerd_cpu_limit" {
@@ -83,7 +86,7 @@ variable "provisionerd_cpu_limit" {
 
 variable "provisionerd_mem_limit" {
   description = "Memory limit to allocate to provisionerd."
-  default     = "1024Mi"
+  default     = "1Gi"
 }
 
 variable "provisionerd_replicas" {
@@ -91,16 +94,21 @@ variable "provisionerd_replicas" {
   default     = 1
 }
 
-variable "provisionerd_concurrency" {
-  description = "Number of concurrent provisioner jobs per provisionerd instance."
-  default     = 3
-}
-
 variable "provisionerd_chart_version" {
   description = "Version of the Provisionerd Helm chart to install. Defaults to latest."
   default     = null
 }
 
+variable "provisionerd_image_repo" {
+  description = "Repository to use for Provisionerd image."
+  default     = "ghcr.io/coder/coder"
+}
+
+variable "provisionerd_image_tag" {
+  description = "Tag to use for Provisionerd image."
+  default     = "latest"
+}
+
 variable "coder_chart_version" {
   description = "Version of the Coder Helm chart to install. Defaults to latest."
   default     = null
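
Note: PATCH 5 assumes the GCP infrastructure already exists and has been
imported into Terraform state; its lifecycle/ignore_changes blocks are what
let a follow-up plan come out clean. A minimal sketch of verifying that
locally is below — the project/zone/cluster identifiers are illustrative
placeholders, not values defined by this series:

  # Apply the series, then re-initialise and import the pre-existing cluster.
  # (Placeholder IDs; substitute your own project/zone/cluster.)
  git am *.patch
  terraform -chdir=scaletest/terraform/infra init
  terraform -chdir=scaletest/terraform/infra import \
    google_container_cluster.primary \
    projects/<project-id>/locations/<zone>/clusters/<cluster-name>

  # With the ignore_changes blocks in place, this should report no changes.
  terraform -chdir=scaletest/terraform/infra plan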