From 86ff71d1fa137eef6fbed4c78c2bb4be478e1f16 Mon Sep 17 00:00:00 2001 From: Mathias Fredriksson Date: Mon, 25 Sep 2023 12:16:16 +0000 Subject: [PATCH 1/7] feat(scaletest): add grafana annotations --- scaletest/templates/scaletest-runner/main.tf | 24 +++- .../templates/scaletest-runner/scripts/lib.sh | 130 +++++++++++++++++- .../scaletest-runner/scripts/prepare.sh | 7 - .../templates/scaletest-runner/shutdown.sh | 4 + .../templates/scaletest-runner/startup.sh | 18 ++- 5 files changed, 168 insertions(+), 15 deletions(-) diff --git a/scaletest/templates/scaletest-runner/main.tf b/scaletest/templates/scaletest-runner/main.tf index 4802c9887793d..b4e80749f0e1a 100644 --- a/scaletest/templates/scaletest-runner/main.tf +++ b/scaletest/templates/scaletest-runner/main.tf @@ -43,6 +43,9 @@ locals { home_disk_size = 10 scaletest_run_id = "scaletest-${time_static.start_time.rfc3339}" scaletest_run_dir = "/home/coder/${local.scaletest_run_id}" + grafana_url = "https://stats.dev.c8s.io" + grafana_dashboard_uid = "qLVSTR-Vz" + grafana_dashboard_name = "coderv2-loadtest-dashboard" } data "coder_provisioner" "me" { @@ -237,6 +240,9 @@ resource "coder_agent" "main" { SCALETEST_CREATE_CONCURRENCY : "${data.coder_parameter.create_concurrency.value}", SCALETEST_CLEANUP_STRATEGY : data.coder_parameter.cleanup_strategy.value, + GRAFANA_URL : local.grafana_url, + # GRAFANA_DASHBOARD_UID : local.grafana_dashboard_uid, + SCRIPTS_ZIP : filebase64(data.archive_file.scripts_zip.output_path), SCRIPTS_DIR : "/tmp/scripts", } @@ -332,7 +338,7 @@ resource "coder_app" "grafana" { agent_id = coder_agent.main.id slug = "00-grafana" display_name = "Grafana" - url = "https://stats.dev.c8s.io/d/qLVSTR-Vz/coderv2-loadtest-dashboard?orgId=1&from=${time_static.start_time.unix * 1000}&to=now" + url = "${local.grafana_url}/d/${local.grafana_dashboard_uid}/${local.grafana_dashboard_name}?orgId=1&from=${time_static.start_time.unix * 1000}&to=now" icon = "https://grafana.com/static/assets/img/fav32.png" external = true } @@ -440,6 +446,15 @@ resource "kubernetes_pod" "main" { name = "CODER_AGENT_LOG_DIR" value = "${local.scaletest_run_dir}/logs" } + env { + name = "GRAFANA_API_TOKEN" + value_from { + secret_key_ref { + name = data.kubernetes_secret.grafana_editor_api_token.metadata[0].name + key = "token" + } + } + } resources { # Set requests and limits values such that we can do performant # execution of `coder scaletest` commands. 
@@ -505,6 +520,13 @@ resource "kubernetes_pod" "main" { } } +data "kubernetes_secret" "grafana_editor_api_token" { + metadata { + name = "grafana-editor-api-token" + namespace = data.coder_parameter.namespace.value + } +} + resource "kubernetes_manifest" "pod_monitor" { count = data.coder_workspace.me.start_count manifest = { diff --git a/scaletest/templates/scaletest-runner/scripts/lib.sh b/scaletest/templates/scaletest-runner/scripts/lib.sh index d392d09681f0a..0982eb01429ef 100644 --- a/scaletest/templates/scaletest-runner/scripts/lib.sh +++ b/scaletest/templates/scaletest-runner/scripts/lib.sh @@ -33,7 +33,13 @@ set_status() { if [[ ${DRY_RUN} == 1 ]]; then dry_run=" (dry-ryn)" fi + prev_status=$(get_status) + if [[ ${prev_status} != *"Not started"* ]]; then + annotate_grafana_end "status" "Status: ${prev_status}" + fi echo "$(date -Ins) ${*}${dry_run}" >>"${SCALETEST_STATE_DIR}/status" + + annotate_grafana "status" "Status: ${*}" } lock_status() { chmod 0440 "${SCALETEST_STATE_DIR}/status" @@ -51,25 +57,29 @@ phase_num=0 start_phase() { # This may be incremented from another script, so we read it every time. if [[ -f "${SCALETEST_PHASE_FILE}" ]]; then - phase_num="$(grep -c START: "${SCALETEST_PHASE_FILE}")" + phase_num=$(grep -c START: "${SCALETEST_PHASE_FILE}") fi phase_num=$((phase_num + 1)) log "Start phase ${phase_num}: ${*}" echo "$(date -Ins) START:${phase_num}: ${*}" >>"${SCALETEST_PHASE_FILE}" + + GRAFANA_EXTRA_TAGS="${PHASE_TYPE:-phase-default}" annotate_grafana "phase" "Phase ${phase_num}: ${*}" } end_phase() { - phase="$(tail -n 1 "${SCALETEST_PHASE_FILE}" | grep "START:${phase_num}:" | cut -d' ' -f3-)" + phase=$(tail -n 1 "${SCALETEST_PHASE_FILE}" | grep "START:${phase_num}:" | cut -d' ' -f3-) if [[ -z ${phase} ]]; then log "BUG: Could not find start phase ${phase_num} in ${SCALETEST_PHASE_FILE}" exit 1 fi log "End phase ${phase_num}: ${phase}" echo "$(date -Ins) END:${phase_num}: ${phase}" >>"${SCALETEST_PHASE_FILE}" + + GRAFANA_EXTRA_TAGS="${PHASE_TYPE:-phase-default}" annotate_grafana_end "phase" "Phase ${phase_num}: ${phase}" } get_phase() { if [[ -f "${SCALETEST_PHASE_FILE}" ]]; then - phase_raw="$(tail -n1 "${SCALETEST_PHASE_FILE}")" - phase="$(echo "${phase_raw}" | cut -d' ' -f3-)" + phase_raw=$(tail -n1 "${SCALETEST_PHASE_FILE}") + phase=$(echo "${phase_raw}" | cut -d' ' -f3-) if [[ ${phase_raw} == *"END:"* ]]; then phase+=" [done]" fi @@ -86,9 +96,117 @@ get_previous_phase() { fi } +annotate_grafana() { + local tags=${1} text=${2} start=${3:-$(($(date +%s) * 1000))} + local json resp id + + if [[ -z $tags ]]; then + tags="scaletest,runner" + else + tags="scaletest,runner,${tags}" + fi + if [[ -n ${GRAFANA_EXTRA_TAGS:-} ]]; then + tags="${tags},${GRAFANA_EXTRA_TAGS}" + fi + + log "Annotating Grafana (start=${start}): ${text} [${tags}]" + + json="$( + jq \ + --argjson time "${start}" \ + --arg text "${text}" \ + --arg tags "${tags}" \ + '{time: $time, tags: $tags | split(","), text: $text}' <<<'{}' + )" + if [[ ${DRY_RUN} == 1 ]]; then + log "Would have annotated Grafana, data=${json}" + return 0 + fi + if ! resp="$( + curl -sSL \ + --insecure \ + -H "Authorization: Bearer ${GRAFANA_API_TOKEN}" \ + -H "Content-Type: application/json" \ + -d "${json}" \ + "${GRAFANA_URL}/api/annotations" + )"; then + # Don't abort scaletest just because we couldn't annotate Grafana. 
+ log "Failed to annotate Grafana: ${resp}" + return 0 + fi + + if [[ $(jq -r '.message' <<<"${resp}") != "Annotation added" ]]; then + log "Failed to annotate Grafana: ${resp}" + return 0 + fi + + log "Grafana annotation added!" + + if [[ ! -f "${SCALETEST_STATE_DIR}" ]]; then + mkdir -p "${SCALETEST_STATE_DIR}" + fi + id="$(jq -r '.id' <<<"${resp}")" + echo "${id}:${tags}:${text}:${start}" >>"${SCALETEST_STATE_DIR}/grafana-annotations" +} +annotate_grafana_end() { + local tags=${1} text=${2} start=${3:-} end=${4:-$(($(date +%s) * 1000))} + local id json resp + + if [[ -z $tags ]]; then + tags="scaletest,runner" + else + tags="scaletest,runner,${tags}" + fi + if [[ -n ${GRAFANA_EXTRA_TAGS:-} ]]; then + tags="${tags},${GRAFANA_EXTRA_TAGS}" + fi + + if [[ ${DRY_RUN} == 1 ]]; then + log "Would have updated Grafana annotation (end=${end}): ${text} [${tags}]" + return 0 + fi + + if ! id=$(grep ":${tags}:${text}:${start}" "${SCALETEST_STATE_DIR}/grafana-annotations" | sort -n | tail -n1 | cut -d: -f1); then + log "NOTICE: Could not find Grafana annotation to end: '${tags}:${text}:${start}', skipping..." + return 0 + fi + + log "Annotating Grafana (end=${end}): ${text} [${tags}]" + + json="$( + jq \ + --argjson timeEnd "${end}" \ + '{timeEnd: $timeEnd}' <<<'{}' + )" + if [[ ${DRY_RUN} == 1 ]]; then + log "Would have patched Grafana annotation: id=${id}, data=${json}" + return 0 + fi + if ! resp="$( + curl -sSL \ + --insecure \ + -H "Authorization: Bearer ${GRAFANA_API_TOKEN}" \ + -H "Content-Type: application/json" \ + -X PATCH \ + -d "${json}" \ + "${GRAFANA_URL}/api/annotations/${id}" + )"; then + # Don't abort scaletest just because we couldn't annotate Grafana. + log "Failed to annotate Grafana end: ${resp}" + return 0 + fi + + if [[ $(jq -r '.message' <<<"${resp}") != "Annotation patched" ]]; then + log "Failed to annotate Grafana end: ${resp}" + return 0 + fi + + log "Grafana annotation patched!" +} + wait_baseline() { s=${1:-2} - start_phase "Waiting ${s}m to establish baseline" + PHASE_TYPE="phase-wait" start_phase "Waiting ${s}m to establish baseline" maybedryrun "$DRY_RUN" sleep $((s * 60)) - end_phase + PHASE_TYPE="phase-wait" end_phase } diff --git a/scaletest/templates/scaletest-runner/scripts/prepare.sh b/scaletest/templates/scaletest-runner/scripts/prepare.sh index f6fbcb7dd3227..2c20ace6f9cd6 100755 --- a/scaletest/templates/scaletest-runner/scripts/prepare.sh +++ b/scaletest/templates/scaletest-runner/scripts/prepare.sh @@ -28,13 +28,6 @@ for dir in "${HOME}/scaletest-"*; do fi done -log "Cloning coder/coder repo..." - -if [[ ! -d "${HOME}/coder" ]]; then - git clone https://github.com/coder/coder.git "${HOME}/coder" -fi -(cd "${HOME}/coder" && git pull) - log "Creating coder CLI token (needed for cleanup during shutdown)..." mkdir -p "${CODER_CONFIG_DIR}" diff --git a/scaletest/templates/scaletest-runner/shutdown.sh b/scaletest/templates/scaletest-runner/shutdown.sh index fe621afe4c6c4..14d6023aaaa62 100755 --- a/scaletest/templates/scaletest-runner/shutdown.sh +++ b/scaletest/templates/scaletest-runner/shutdown.sh @@ -11,4 +11,8 @@ cleanup() { } trap cleanup EXIT +annotate_grafana "workspace" "Agent stopping..." 
+ "${SCRIPTS_DIR}/cleanup.sh" shutdown + +annotate_grafana_end "workspace" "Agent running" diff --git a/scaletest/templates/scaletest-runner/startup.sh b/scaletest/templates/scaletest-runner/startup.sh index 0d7c8fb144324..57151fcf2e1a4 100755 --- a/scaletest/templates/scaletest-runner/startup.sh +++ b/scaletest/templates/scaletest-runner/startup.sh @@ -12,9 +12,17 @@ mkdir -p "${SCRIPTS_DIR}" unzip -o /tmp/scripts.zip -d "${SCRIPTS_DIR}" rm /tmp/scripts.zip +echo "Cloning coder/coder repo..." +if [[ ! -d "${HOME}/coder" ]]; then + git clone https://github.com/coder/coder.git "${HOME}/coder" +fi +(cd "${HOME}/coder" && git pull) + # shellcheck disable=SC2153 source=scaletest/templates/scaletest-runner/scripts/lib.sh . "${SCRIPTS_DIR}/lib.sh" +annotate_grafana "workspace" "Agent running" # Ended in shutdown.sh. + # Show failure in the UI if script exits with error. failed_status=Failed on_exit() { @@ -38,15 +46,23 @@ on_exit() { "${SCRIPTS_DIR}/cleanup.sh" "${SCALETEST_CLEANUP_STRATEGY}" ;; esac + + annotate_grafana_end "" "Start scaletest" } trap on_exit EXIT on_err() { + code=${?} + trap - ERR + log "Scaletest failed!" - set_status "${failed_status}" + GRAFANA_EXTRA_TAGS=error set_status "${failed_status} (exit=${code})" lock_status # Ensure we never rewrite the status after a failure. } trap on_err ERR +annotate_grafana "" "Start scaletest" + "${SCRIPTS_DIR}/prepare.sh" + "${SCRIPTS_DIR}/run.sh" From 2deffab519ade1c7d09012431e9e661bbe6b64d5 Mon Sep 17 00:00:00 2001 From: Mathias Fredriksson Date: Mon, 25 Sep 2023 12:16:16 +0000 Subject: [PATCH 2/7] feat(scaletest): add slack reporting --- scaletest/templates/scaletest-runner/main.tf | 126 +++++++++++++++--- .../scaletest-runner/scripts/cleanup.sh | 2 +- .../scaletest-runner/scripts/report.sh | 103 ++++++++++++++ .../templates/scaletest-runner/scripts/run.sh | 79 ++++++----- .../templates/scaletest-runner/startup.sh | 14 +- 5 files changed, 267 insertions(+), 57 deletions(-) create mode 100755 scaletest/templates/scaletest-runner/scripts/report.sh diff --git a/scaletest/templates/scaletest-runner/main.tf b/scaletest/templates/scaletest-runner/main.tf index b4e80749f0e1a..a1b3b52735770 100644 --- a/scaletest/templates/scaletest-runner/main.tf +++ b/scaletest/templates/scaletest-runner/main.tf @@ -38,8 +38,8 @@ locals { workspace_pod_name = "coder-scaletest-runner-${lower(data.coder_workspace.me.owner)}-${lower(data.coder_workspace.me.name)}" workspace_pod_instance = "coder-workspace-${lower(data.coder_workspace.me.owner)}-${lower(data.coder_workspace.me.name)}" service_account_name = "scaletest-sa" - cpu = 2 - memory = 2 + cpu = 16 + memory = 64 home_disk_size = 10 scaletest_run_id = "scaletest-${time_static.start_time.rfc3339}" scaletest_run_dir = "/home/coder/${local.scaletest_run_id}" @@ -94,15 +94,14 @@ data "coder_parameter" "job_concurrency" { order = 11 type = "number" name = "Job concurrency" - default = 10 + default = 0 description = "The number of concurrent jobs (e.g. when producing workspace traffic)." mutable = true # Setting zero = unlimited, but perhaps not a good idea, # we can raise this limit instead. validation { - min = 1 - max = 100 + min = 0 } } @@ -200,6 +199,73 @@ data "coder_parameter" "num_workspaces" { } } + +data "coder_parameter" "load_scenarios" { + order = 22 + name = "Load Scenarios" + type = "list(string)" + description = "The load scenarios to run." 
+ mutable = true + ephemeral = true + default = jsonencode([ + "SSH Traffic", + "Web Terminal Traffic", + "Dashboard Traffic", + ]) +} + +data "coder_parameter" "load_scenario_ssh_traffic_duration" { + order = 23 + name = "SSH Traffic Duration" + type = "number" + description = "The duration of the SSH traffic load scenario in minutes." + mutable = true + default = 30 + validation { + min = 1 + max = 1440 // 24 hours. + } +} + +data "coder_parameter" "load_scenario_web_terminal_traffic_duration" { + order = 24 + name = "Web Terminal Traffic Duration" + type = "number" + description = "The duration of the web terminal traffic load scenario in minutes." + mutable = true + default = 30 + validation { + min = 1 + max = 1440 // 24 hours. + } +} + +data "coder_parameter" "load_scenario_dashboard_traffic_duration" { + order = 25 + name = "Dashboard Traffic Duration" + type = "number" + description = "The duration of the dashboard traffic load scenario in minutes." + mutable = true + default = 30 + validation { + min = 1 + max = 1440 // 24 hours. + } +} + +data "coder_parameter" "load_scenario_baseline_duration" { + order = 26 + name = "Baseline Wait Duration" + type = "number" + description = "The duration to wait before starting a load scenario in minutes." + mutable = true + default = 5 + validation { + min = 0 + max = 60 + } +} + data "coder_parameter" "namespace" { order = 999 type = "string" @@ -224,6 +290,8 @@ resource "coder_agent" "main" { CODER_CONFIG_DIR : "/home/coder/.config/coderv2", CODER_USER_TOKEN : data.coder_workspace.me.owner_session_token, CODER_URL : data.coder_workspace.me.access_url, + CODER_USER : data.coder_workspace.me.owner, + CODER_WORKSPACE : data.coder_workspace.me.name, # Global scaletest envs that may affect each `coder exp scaletest` invocation. CODER_SCALETEST_PROMETHEUS_ADDRESS : "0.0.0.0:21112", @@ -231,14 +299,23 @@ resource "coder_agent" "main" { CODER_SCALETEST_CONCURRENCY : "${data.coder_parameter.job_concurrency.value}", CODER_SCALETEST_CLEANUP_CONCURRENCY : "${data.coder_parameter.cleanup_concurrency.value}", + # Expose as params as well, for reporting (TODO(mafredri): refactor, only have one). + SCALETEST_PARAM_SCALETEST_CONCURRENCY : "${data.coder_parameter.job_concurrency.value}", + SCALETEST_PARAM_SCALETEST_CLEANUP_CONCURRENCY : "${data.coder_parameter.cleanup_concurrency.value}", + # Local envs passed as arguments to `coder exp scaletest` invocations. 
SCALETEST_RUN_ID : local.scaletest_run_id, SCALETEST_RUN_DIR : local.scaletest_run_dir, - SCALETEST_TEMPLATE : data.coder_parameter.workspace_template.value, - SCALETEST_SKIP_CLEANUP : "1", - SCALETEST_NUM_WORKSPACES : data.coder_parameter.num_workspaces.value, - SCALETEST_CREATE_CONCURRENCY : "${data.coder_parameter.create_concurrency.value}", - SCALETEST_CLEANUP_STRATEGY : data.coder_parameter.cleanup_strategy.value, + + SCALETEST_PARAM_TEMPLATE : data.coder_parameter.workspace_template.value, + SCALETEST_PARAM_NUM_WORKSPACES : data.coder_parameter.num_workspaces.value, + SCALETEST_PARAM_CREATE_CONCURRENCY : "${data.coder_parameter.create_concurrency.value}", + SCALETEST_PARAM_CLEANUP_STRATEGY : data.coder_parameter.cleanup_strategy.value, + SCALETEST_PARAM_LOAD_SCENARIOS : data.coder_parameter.load_scenarios.value, + SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_DURATION : "${data.coder_parameter.load_scenario_ssh_traffic_duration.value}", + SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_DURATION : "${data.coder_parameter.load_scenario_web_terminal_traffic_duration.value}", + SCALETEST_PARAM_LOAD_SCENARIO_DASHBOARD_TRAFFIC_DURATION : "${data.coder_parameter.load_scenario_dashboard_traffic_duration.value}", + SCALETEST_PARAM_LOAD_SCENARIO_BASELINE_DURATION : "${data.coder_parameter.load_scenario_baseline_duration.value}", GRAFANA_URL : local.grafana_url, # GRAFANA_DASHBOARD_UID : local.grafana_dashboard_uid, @@ -250,12 +327,13 @@ resource "coder_agent" "main" { vscode = false ssh_helper = false } - startup_script_timeout = 3600 - shutdown_script_timeout = 1800 + startup_script_timeout = 86400 + shutdown_script_timeout = 7200 startup_script_behavior = "blocking" startup_script = file("startup.sh") shutdown_script = file("shutdown.sh") + # IDEA(mafredri): It would be pretty cool to define metadata to expect JSON output, each field/item could become a separate metadata item. # Scaletest metadata. metadata { display_name = "Scaletest status" @@ -415,7 +493,7 @@ resource "kubernetes_pod" "main" { } # Set the pod delete timeout to termination_grace_period_seconds + 1m. timeouts { - delete = "32m" + delete = "122m" } spec { security_context { @@ -427,8 +505,8 @@ resource "kubernetes_pod" "main" { service_account_name = local.service_account_name # Allow the coder agent to perform graceful shutdown and cleanup of - # scaletest resources, 30 minutes (cleanup timeout) + 1 minute. - termination_grace_period_seconds = 1860 + # scaletest resources, 2 hours (cleanup timeout) + 1 minute. + termination_grace_period_seconds = 7260 container { name = "dev" @@ -455,6 +533,15 @@ resource "kubernetes_pod" "main" { } } } + env { + name = "SLACK_WEBHOOK_URL" + value_from { + secret_key_ref { + name = data.kubernetes_secret.slack_scaletest_notifications_webhook_url.metadata[0].name + key = "url" + } + } + } resources { # Set requests and limits values such that we can do performant # execution of `coder scaletest` commands. @@ -511,7 +598,7 @@ resource "kubernetes_pod" "main" { match_expressions { key = "cloud.google.com/gke-nodepool" operator = "In" - values = ["big-misc"] # Avoid placing on the same nodes as scaletest workspaces. + values = ["big-workspacetraffic"] # Avoid placing on the same nodes as scaletest workspaces. 
} } } @@ -527,6 +614,13 @@ data "kubernetes_secret" "grafana_editor_api_token" { } } +data "kubernetes_secret" "slack_scaletest_notifications_webhook_url" { + metadata { + name = "slack-scaletest-notifications-webhook-url" + namespace = data.coder_parameter.namespace.value + } +} + resource "kubernetes_manifest" "pod_monitor" { count = data.coder_workspace.me.start_count manifest = { diff --git a/scaletest/templates/scaletest-runner/scripts/cleanup.sh b/scaletest/templates/scaletest-runner/scripts/cleanup.sh index a6d29211a080b..9d2c23463249e 100755 --- a/scaletest/templates/scaletest-runner/scripts/cleanup.sh +++ b/scaletest/templates/scaletest-runner/scripts/cleanup.sh @@ -24,7 +24,7 @@ fi start_phase "Cleanup (${event})" coder exp scaletest cleanup \ --cleanup-job-timeout 15m \ - --cleanup-timeout 30m | + --cleanup-timeout 2h | tee "${SCALETEST_RESULTS_DIR}/cleanup-${event}.txt" end_phase diff --git a/scaletest/templates/scaletest-runner/scripts/report.sh b/scaletest/templates/scaletest-runner/scripts/report.sh new file mode 100755 index 0000000000000..bb7cc393ef8ba --- /dev/null +++ b/scaletest/templates/scaletest-runner/scripts/report.sh @@ -0,0 +1,103 @@ +#!/bin/bash +set -euo pipefail + +[[ $VERBOSE == 1 ]] && set -x + +status=$1 +shift + +case "${status}" in +started) ;; +completed) ;; +failed) ;; +*) + echo "Unknown status: ${status}" >&2 + exit 1 + ;; +esac + +# shellcheck disable=SC2153 source=scaletest/templates/scaletest-runner/scripts/lib.sh +. "${SCRIPTS_DIR}/lib.sh" + +# NOTE(mafredri): API returns HTML if we accidentally use `...//api` vs `.../api`. +CODER_URL="${CODER_URL%/}" +buildinfo="$(curl -sSL "${CODER_URL}/api/v2/buildinfo")" +server_version="$(jq -r '.version' <<<"${buildinfo}")" +server_version_commit="$(jq -r '.external_url' <<<"${buildinfo}")" + +# Since `coder show` doesn't support JSON output, we list the workspaces instead. +workspace_json="$(DRYRUN=0 coder list --all --output json | jq --arg workspace "${CODER_WORKSPACE}" --arg user "${CODER_USER}" 'map(select(.name == $workspace) | select(.owner_name == $user)) | .[0]')" +owner_name="$(jq -r '.latest_build.workspace_owner_name' <<<"${workspace_json}")" +workspace_name="$(jq -r '.latest_build.workspace_name' <<<"${workspace_json}")" +initiator_name="$(jq -r '.latest_build.initiator_name' <<<"${workspace_json}")" + +bullet='•' +app_urls_raw="$(jq -r '.latest_build.resources[].agents[]?.apps | map(select(.external == true)) | .[] | .display_name, .url' <<<"${workspace_json}")" +app_urls=() +while read -r app_name; do + read -r app_url + bold= + if [[ ${status} != started ]] && [[ ${app_url} = *to=now* ]]; then + # Update Grafana URL with end stamp and make bold. + app_url="${app_url//to=now/to=$(($(date +%s) * 1000))}" + bold='**' + fi + app_urls+=("${bullet} ${bold}${app_name}: ${app_url}${bold}") +done <<<"${app_urls_raw}" + +params=() +header= + +case "${status}" in +started) + created_at="$(jq -r '.latest_build.created_at' <<<"${workspace_json}")" + params=("${bullet} Options:") + while read -r param; do + params+=(" ${bullet} ${param}") + done <<<"$(jq -r '.latest_build.resources[].agents[]?.environment_variables | to_entries | map(select(.key | startswith("SCALETEST_PARAM_"))) | .[] | "`\(.key)`: `\(.value)`"' <<<"${workspace_json}")" + + header="New scaletest started at \`${created_at}\` by \`${initiator_name}\` on ${CODER_URL} (<${server_version_commit}|\`${server_version}\`>)." 
+ ;; +completed) + completed_at=$(date -Iseconds) + header="Scaletest completed at \`${completed_at}\` (started by \`${initiator_name}\`) on ${CODER_URL} (<${server_version_commit}|\`${server_version}\`>)." + ;; +failed) + failed_at=$(date -Iseconds) + header="Scaletest failed at \`${failed_at}\` (started by \`${initiator_name}\`) on ${CODER_URL} (<${server_version_commit}|\`${server_version}\`>)." + ;; +*) + echo "Unknown status: ${status}" >&2 + exit 1 + ;; +esac + +text_arr=( + "${header}" + "" + "${bullet} Workspace (runner): ${CODER_URL}@${owner_name}/${workspace_name}" + "${bullet} Run ID: ${SCALETEST_RUN_ID}" + "${app_urls[@]}" + "${params[@]}" +) + +text= +for field in "${text_arr[@]}"; do + text+="${field}"$'\n' +done + +json=$( + jq -n --arg text "${text}" '{ + blocks: [ + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": $text + } + } + ] + }' +) + +maybedryrun "${DRY_RUN}" curl -X POST -H 'Content-type: application/json' --data "${json}" "${SLACK_WEBHOOK_URL}" diff --git a/scaletest/templates/scaletest-runner/scripts/run.sh b/scaletest/templates/scaletest-runner/scripts/run.sh index 7ebf8c4310593..294cc956cc4ab 100755 --- a/scaletest/templates/scaletest-runner/scripts/run.sh +++ b/scaletest/templates/scaletest-runner/scripts/run.sh @@ -6,54 +6,61 @@ set -euo pipefail # shellcheck disable=SC2153 source=scaletest/templates/scaletest-runner/scripts/lib.sh . "${SCRIPTS_DIR}/lib.sh" +mapfile -t scaletest_load_scenarios < <(jq -r '. | join ("\n")' <<<"${SCALETEST_PARAM_LOAD_SCENARIOS}") +export SCALETEST_PARAM_LOAD_SCENARIOS=("${scaletest_load_scenarios[@]}") + log "Running scaletest..." set_status Running start_phase "Creating workspaces" coder exp scaletest create-workspaces \ - --count "${SCALETEST_NUM_WORKSPACES}" \ - --template "${SCALETEST_TEMPLATE}" \ - --concurrency "${SCALETEST_CREATE_CONCURRENCY}" \ - --job-timeout 15m \ + --count "${SCALETEST_PARAM_NUM_WORKSPACES}" \ + --template "${SCALETEST_PARAM_TEMPLATE}" \ + --concurrency "${SCALETEST_PARAM_CREATE_CONCURRENCY}" \ + --job-timeout 2h \ --no-cleanup \ --output json:"${SCALETEST_RESULTS_DIR}/create-workspaces.json" show_json "${SCALETEST_RESULTS_DIR}/create-workspaces.json" end_phase -wait_baseline 5 - -start_phase "SSH traffic" -coder exp scaletest workspace-traffic \ - --ssh \ - --bytes-per-tick 10240 \ - --tick-interval 1s \ - --timeout 5m \ - --output json:"${SCALETEST_RESULTS_DIR}/traffic-ssh.json" -show_json "${SCALETEST_RESULTS_DIR}/traffic-ssh.json" -end_phase - -wait_baseline 5 +wait_baseline "${SCALETEST_PARAM_LOAD_SCENARIO_BASELINE_DURATION}" -start_phase "ReconnectingPTY traffic" -coder exp scaletest workspace-traffic \ - --bytes-per-tick 10240 \ - --tick-interval 1s \ - --timeout 5m \ - --output json:"${SCALETEST_RESULTS_DIR}/traffic-reconnectingpty.json" -show_json "${SCALETEST_RESULTS_DIR}/traffic-reconnectingpty.json" -end_phase - -wait_baseline 5 - -start_phase "Dashboard traffic" -coder exp scaletest dashboard \ - --count "${SCALETEST_NUM_WORKSPACES}" \ - --job-timeout 5m \ - --output json:"${SCALETEST_RESULTS_DIR}/traffic-dashboard.json" -show_json "${SCALETEST_RESULTS_DIR}/traffic-dashboard.json" -end_phase +for scenario in "${SCALETEST_PARAM_LOAD_SCENARIOS[@]}"; do + start_phase "Load scenario: ${scenario}" + case "${scenario}" in + "SSH Traffic") + coder exp scaletest workspace-traffic \ + --ssh \ + --bytes-per-tick 1024 \ + --tick-interval 100ms \ + --timeout "${SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_DURATION}m" \ + --job-timeout 
"${SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_DURATION}m30s" \ + --output json:"${SCALETEST_RESULTS_DIR}/traffic-ssh.json" + show_json "${SCALETEST_RESULTS_DIR}/traffic-ssh.json" + ;; + "Web Terminal Traffic") + coder exp scaletest workspace-traffic \ + --bytes-per-tick 1024 \ + --tick-interval 100ms \ + --timeout "${SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_DURATION}m" \ + --job-timeout "${SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_DURATION}m30s" \ + --output json:"${SCALETEST_RESULTS_DIR}/traffic-web-terminal.json" + show_json "${SCALETEST_RESULTS_DIR}/traffic-web-terminal.json" + ;; + "Dashboard Traffic") + coder exp scaletest dashboard \ + --count "${SCALETEST_PARAM_NUM_WORKSPACES}" \ + --timeout "${SCALETEST_PARAM_LOAD_SCENARIO_DASHBOARD_TRAFFIC_DURATION}m" \ + --job-timeout "${SCALETEST_PARAM_LOAD_SCENARIO_DASHBOARD_TRAFFIC_DURATION}m30s" \ + --output json:"${SCALETEST_RESULTS_DIR}/traffic-dashboard.json" \ + >"${SCALETEST_RESULTS_DIR}/traffic-dashboard-output.log" + show_json "${SCALETEST_RESULTS_DIR}/traffic-dashboard.json" + ;; + esac + end_phase -wait_baseline 5 + wait_baseline "${SCALETEST_PARAM_LOAD_SCENARIO_BASELINE_DURATION}" +done log "Scaletest complete!" set_status Complete diff --git a/scaletest/templates/scaletest-runner/startup.sh b/scaletest/templates/scaletest-runner/startup.sh index 57151fcf2e1a4..e0ea9316a9be8 100755 --- a/scaletest/templates/scaletest-runner/startup.sh +++ b/scaletest/templates/scaletest-runner/startup.sh @@ -28,22 +28,22 @@ failed_status=Failed on_exit() { trap - ERR EXIT - case "${SCALETEST_CLEANUP_STRATEGY}" in + case "${SCALETEST_PARAM_CLEANUP_STRATEGY}" in on_stop) # Handled by shutdown script. ;; on_success) if [[ $(get_status) != "${failed_status}" ]]; then - "${SCRIPTS_DIR}/cleanup.sh" "${SCALETEST_CLEANUP_STRATEGY}" + "${SCRIPTS_DIR}/cleanup.sh" "${SCALETEST_PARAM_CLEANUP_STRATEGY}" fi ;; on_error) if [[ $(get_status) = "${failed_status}" ]]; then - "${SCRIPTS_DIR}/cleanup.sh" "${SCALETEST_CLEANUP_STRATEGY}" + "${SCRIPTS_DIR}/cleanup.sh" "${SCALETEST_PARAM_CLEANUP_STRATEGY}" fi ;; *) - "${SCRIPTS_DIR}/cleanup.sh" "${SCALETEST_CLEANUP_STRATEGY}" + "${SCRIPTS_DIR}/cleanup.sh" "${SCALETEST_PARAM_CLEANUP_STRATEGY}" ;; esac @@ -54,15 +54,21 @@ trap on_exit EXIT on_err() { code=${?} trap - ERR + set +e log "Scaletest failed!" GRAFANA_EXTRA_TAGS=error set_status "${failed_status} (exit=${code})" + "${SCRIPTS_DIR}/report.sh" failed lock_status # Ensure we never rewrite the status after a failure. } trap on_err ERR +# Pass session token since `prepare.sh` has not yet run. 
+CODER_SESSION_TOKEN=$CODER_USER_TOKEN "${SCRIPTS_DIR}/report.sh" started annotate_grafana "" "Start scaletest" "${SCRIPTS_DIR}/prepare.sh" "${SCRIPTS_DIR}/run.sh" + +"${SCRIPTS_DIR}/report.sh" completed From a9beb89a7ef348c4df970cdd1d57ca6638987c28 Mon Sep 17 00:00:00 2001 From: Mathias Fredriksson Date: Mon, 25 Sep 2023 12:23:22 +0000 Subject: [PATCH 3/7] fix url --- scaletest/templates/scaletest-runner/scripts/report.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scaletest/templates/scaletest-runner/scripts/report.sh b/scaletest/templates/scaletest-runner/scripts/report.sh index bb7cc393ef8ba..d9bcaa2e08258 100755 --- a/scaletest/templates/scaletest-runner/scripts/report.sh +++ b/scaletest/templates/scaletest-runner/scripts/report.sh @@ -75,7 +75,7 @@ esac text_arr=( "${header}" "" - "${bullet} Workspace (runner): ${CODER_URL}@${owner_name}/${workspace_name}" + "${bullet} Workspace (runner): ${CODER_URL}/@${owner_name}/${workspace_name}" "${bullet} Run ID: ${SCALETEST_RUN_ID}" "${app_urls[@]}" "${params[@]}" From 2ef0e8fe0dd06f8d0b78cb3cde0cedbb2621263f Mon Sep 17 00:00:00 2001 From: Mathias Fredriksson Date: Mon, 25 Sep 2023 13:51:45 +0000 Subject: [PATCH 4/7] try signle * for bold slack message --- scaletest/templates/scaletest-runner/scripts/report.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scaletest/templates/scaletest-runner/scripts/report.sh b/scaletest/templates/scaletest-runner/scripts/report.sh index d9bcaa2e08258..a6f11318ab6c4 100755 --- a/scaletest/templates/scaletest-runner/scripts/report.sh +++ b/scaletest/templates/scaletest-runner/scripts/report.sh @@ -40,7 +40,7 @@ while read -r app_name; do if [[ ${status} != started ]] && [[ ${app_url} = *to=now* ]]; then # Update Grafana URL with end stamp and make bold. app_url="${app_url//to=now/to=$(($(date +%s) * 1000))}" - bold='**' + bold='*' fi app_urls+=("${bullet} ${bold}${app_name}: ${app_url}${bold}") done <<<"${app_urls_raw}" From 1414f739a003727fd0c8fdc903b482e74a393e56 Mon Sep 17 00:00:00 2001 From: Mathias Fredriksson Date: Tue, 26 Sep 2023 17:43:24 +0000 Subject: [PATCH 5/7] non-hardcoded grace period --- scaletest/templates/scaletest-runner/main.tf | 31 ++++++++++---------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/scaletest/templates/scaletest-runner/main.tf b/scaletest/templates/scaletest-runner/main.tf index a1b3b52735770..b3b74cb54af6a 100644 --- a/scaletest/templates/scaletest-runner/main.tf +++ b/scaletest/templates/scaletest-runner/main.tf @@ -35,17 +35,18 @@ resource "null_resource" "permission_check" { } locals { - workspace_pod_name = "coder-scaletest-runner-${lower(data.coder_workspace.me.owner)}-${lower(data.coder_workspace.me.name)}" - workspace_pod_instance = "coder-workspace-${lower(data.coder_workspace.me.owner)}-${lower(data.coder_workspace.me.name)}" - service_account_name = "scaletest-sa" - cpu = 16 - memory = 64 - home_disk_size = 10 - scaletest_run_id = "scaletest-${time_static.start_time.rfc3339}" - scaletest_run_dir = "/home/coder/${local.scaletest_run_id}" - grafana_url = "https://stats.dev.c8s.io" - grafana_dashboard_uid = "qLVSTR-Vz" - grafana_dashboard_name = "coderv2-loadtest-dashboard" + workspace_pod_name = "coder-scaletest-runner-${lower(data.coder_workspace.me.owner)}-${lower(data.coder_workspace.me.name)}" + workspace_pod_instance = "coder-workspace-${lower(data.coder_workspace.me.owner)}-${lower(data.coder_workspace.me.name)}" + workspace_pod_termination_grace_period_seconds = 7200 # 2 hours (cleanup timeout). 
+ service_account_name = "scaletest-sa" + cpu = 16 + memory = 64 + home_disk_size = 10 + scaletest_run_id = "scaletest-${time_static.start_time.rfc3339}" + scaletest_run_dir = "/home/coder/${local.scaletest_run_id}" + grafana_url = "https://stats.dev.c8s.io" + grafana_dashboard_uid = "qLVSTR-Vz" + grafana_dashboard_name = "coderv2-loadtest-dashboard" } data "coder_provisioner" "me" { @@ -318,7 +319,6 @@ resource "coder_agent" "main" { SCALETEST_PARAM_LOAD_SCENARIO_BASELINE_DURATION : "${data.coder_parameter.load_scenario_baseline_duration.value}", GRAFANA_URL : local.grafana_url, - # GRAFANA_DASHBOARD_UID : local.grafana_dashboard_uid, SCRIPTS_ZIP : filebase64(data.archive_file.scripts_zip.output_path), SCRIPTS_DIR : "/tmp/scripts", @@ -493,7 +493,7 @@ resource "kubernetes_pod" "main" { } # Set the pod delete timeout to termination_grace_period_seconds + 1m. timeouts { - delete = "122m" + delete = "${(local.workspace_pod_termination_grace_period_seconds + 120) / 60}s" } spec { security_context { @@ -505,8 +505,9 @@ resource "kubernetes_pod" "main" { service_account_name = local.service_account_name # Allow the coder agent to perform graceful shutdown and cleanup of - # scaletest resources, 2 hours (cleanup timeout) + 1 minute. - termination_grace_period_seconds = 7260 + # scaletest resources. We add an extra minute so ensure work + # completion is prioritized over timeout. + termination_grace_period_seconds = local.workspace_pod_termination_grace_period_seconds + 60 container { name = "dev" From 73dfa18950f74d371fdc621b649e90f9a837b15a Mon Sep 17 00:00:00 2001 From: Mathias Fredriksson Date: Tue, 26 Sep 2023 17:51:13 +0000 Subject: [PATCH 6/7] add params for tick bytes/interval --- scaletest/templates/scaletest-runner/main.tf | 56 ++++++++++++++++++- .../templates/scaletest-runner/scripts/run.sh | 8 +-- 2 files changed, 58 insertions(+), 6 deletions(-) diff --git a/scaletest/templates/scaletest-runner/main.tf b/scaletest/templates/scaletest-runner/main.tf index b3b74cb54af6a..c5b93d00978e1 100644 --- a/scaletest/templates/scaletest-runner/main.tf +++ b/scaletest/templates/scaletest-runner/main.tf @@ -228,8 +228,32 @@ data "coder_parameter" "load_scenario_ssh_traffic_duration" { } } -data "coder_parameter" "load_scenario_web_terminal_traffic_duration" { +data "coder_parameter" "load_scenario_ssh_bytes_per_tick" { order = 24 + name = "SSH Bytes Per Tick" + type = "number" + description = "The number of bytes to send per tick in the SSH traffic load scenario." + mutable = true + default = 1024 + validation { + min = 1 + } +} + +data "coder_parameter" "load_scenario_ssh_tick_interval" { + order = 25 + name = "SSH Tick Interval" + type = "number" + description = "The number of milliseconds between each tick in the SSH traffic load scenario." + mutable = true + default = 100 + validation { + min = 1 + } +} + +data "coder_parameter" "load_scenario_web_terminal_traffic_duration" { + order = 26 name = "Web Terminal Traffic Duration" type = "number" description = "The duration of the web terminal traffic load scenario in minutes." @@ -241,8 +265,32 @@ data "coder_parameter" "load_scenario_web_terminal_traffic_duration" { } } +data "coder_parameter" "load_scenario_web_terminal_bytes_per_tick" { + order = 27 + name = "Web Terminal Bytes Per Tick" + type = "number" + description = "The number of bytes to send per tick in the web terminal traffic load scenario." 
+ mutable = true + default = 1024 + validation { + min = 1 + } +} + +data "coder_parameter" "load_scenario_web_terminal_tick_interval" { + order = 28 + name = "Web Terminal Tick Interval" + type = "number" + description = "The number of milliseconds between each tick in the web terminal traffic load scenario." + mutable = true + default = 100 + validation { + min = 1 + } +} + data "coder_parameter" "load_scenario_dashboard_traffic_duration" { - order = 25 + order = 29 name = "Dashboard Traffic Duration" type = "number" description = "The duration of the dashboard traffic load scenario in minutes." @@ -314,7 +362,11 @@ resource "coder_agent" "main" { SCALETEST_PARAM_CLEANUP_STRATEGY : data.coder_parameter.cleanup_strategy.value, SCALETEST_PARAM_LOAD_SCENARIOS : data.coder_parameter.load_scenarios.value, SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_DURATION : "${data.coder_parameter.load_scenario_ssh_traffic_duration.value}", + SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_BYTES_PER_TICK : "${data.coder_parameter.load_scenario_ssh_bytes_per_tick.value}", + SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_TICK_INTERVAL : "${data.coder_parameter.load_scenario_ssh_tick_interval.value}", SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_DURATION : "${data.coder_parameter.load_scenario_web_terminal_traffic_duration.value}", + SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_BYTES_PER_TICK : "${data.coder_parameter.load_scenario_web_terminal_bytes_per_tick.value}", + SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_TICK_INTERVAL : "${data.coder_parameter.load_scenario_web_terminal_tick_interval.value}", SCALETEST_PARAM_LOAD_SCENARIO_DASHBOARD_TRAFFIC_DURATION : "${data.coder_parameter.load_scenario_dashboard_traffic_duration.value}", SCALETEST_PARAM_LOAD_SCENARIO_BASELINE_DURATION : "${data.coder_parameter.load_scenario_baseline_duration.value}", diff --git a/scaletest/templates/scaletest-runner/scripts/run.sh b/scaletest/templates/scaletest-runner/scripts/run.sh index 294cc956cc4ab..1197283f82b8d 100755 --- a/scaletest/templates/scaletest-runner/scripts/run.sh +++ b/scaletest/templates/scaletest-runner/scripts/run.sh @@ -31,8 +31,8 @@ for scenario in "${SCALETEST_PARAM_LOAD_SCENARIOS[@]}"; do "SSH Traffic") coder exp scaletest workspace-traffic \ --ssh \ - --bytes-per-tick 1024 \ - --tick-interval 100ms \ + --bytes-per-tick "${SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_BYTES_PER_TICK}" \ + --tick-interval "${SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_TICK_INTERVAL}ms" \ --timeout "${SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_DURATION}m" \ --job-timeout "${SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_DURATION}m30s" \ --output json:"${SCALETEST_RESULTS_DIR}/traffic-ssh.json" @@ -40,8 +40,8 @@ for scenario in "${SCALETEST_PARAM_LOAD_SCENARIOS[@]}"; do ;; "Web Terminal Traffic") coder exp scaletest workspace-traffic \ - --bytes-per-tick 1024 \ - --tick-interval 100ms \ + --bytes-per-tick "${SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_BYTES_PER_TICK}" \ + --tick-interval "${SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_TICK_INTERVAL}ms" \ --timeout "${SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_DURATION}m" \ --job-timeout "${SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_DURATION}m30s" \ --output json:"${SCALETEST_RESULTS_DIR}/traffic-web-terminal.json" From b92ead16bce5b6413a752635e7825d426baf5d95 Mon Sep 17 00:00:00 2001 From: Mathias Fredriksson Date: Tue, 26 Sep 2023 21:25:31 +0300 Subject: [PATCH 7/7] Update scaletest/templates/scaletest-runner/scripts/report.sh --- 
scaletest/templates/scaletest-runner/scripts/report.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/scaletest/templates/scaletest-runner/scripts/report.sh b/scaletest/templates/scaletest-runner/scripts/report.sh index a6f11318ab6c4..453d4e53c6e16 100755 --- a/scaletest/templates/scaletest-runner/scripts/report.sh +++ b/scaletest/templates/scaletest-runner/scripts/report.sh @@ -20,6 +20,7 @@ esac . "${SCRIPTS_DIR}/lib.sh" # NOTE(mafredri): API returns HTML if we accidentally use `...//api` vs `.../api`. +# https://github.com/coder/coder/issues/9877 CODER_URL="${CODER_URL%/}" buildinfo="$(curl -sSL "${CODER_URL}/api/v2/buildinfo")" server_version="$(jq -r '.version' <<<"${buildinfo}")"
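
A minimal, hypothetical smoke test (not part of the patch series) for the Grafana wiring introduced in PATCH 1/7: it exercises the same annotation round trip that lib.sh performs — a POST to ${GRAFANA_URL}/api/annotations followed by a PATCH of the returned id — so the GRAFANA_URL and GRAFANA_API_TOKEN values can be verified before a full run. It assumes both variables are exported the way the runner pod exports them (the runner additionally passes --insecure to curl); the tags and text below are illustrative placeholders only.

#!/usr/bin/env bash
# Hypothetical smoke test for the Grafana annotation endpoints used by lib.sh.
set -euo pipefail

# Fail early if the expected environment is missing.
: "${GRAFANA_URL:?}" "${GRAFANA_API_TOKEN:?}"

start_ms=$(($(date +%s) * 1000))

# Create a throwaway annotation (mirrors annotate_grafana).
payload=$(jq -n --argjson time "${start_ms}" \
	'{time: $time, tags: ["scaletest", "runner", "smoke-test"], text: "annotation smoke test"}')
resp=$(curl -sSL \
	-H "Authorization: Bearer ${GRAFANA_API_TOKEN}" \
	-H "Content-Type: application/json" \
	-d "${payload}" \
	"${GRAFANA_URL}/api/annotations")
id=$(jq -r '.id' <<<"${resp}")
echo "created annotation id=${id}: $(jq -r '.message' <<<"${resp}")"

# Close it immediately by patching the end timestamp (mirrors annotate_grafana_end).
end_ms=$(($(date +%s) * 1000))
curl -sSL -X PATCH \
	-H "Authorization: Bearer ${GRAFANA_API_TOKEN}" \
	-H "Content-Type: application/json" \
	-d "$(jq -n --argjson timeEnd "${end_ms}" '{timeEnd: $timeEnd}')" \
	"${GRAFANA_URL}/api/annotations/${id}"

If the token lacks editor permissions or the URL is wrong, the first curl will return an error body instead of {"message": "Annotation added", ...}, which is the same condition the runner logs and tolerates in annotate_grafana.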