diff --git a/scaletest/templates/scaletest-runner/main.tf b/scaletest/templates/scaletest-runner/main.tf index c5b93d00978e1..82653a533dcb6 100644 --- a/scaletest/templates/scaletest-runner/main.tf +++ b/scaletest/templates/scaletest-runner/main.tf @@ -37,7 +37,7 @@ resource "null_resource" "permission_check" { locals { workspace_pod_name = "coder-scaletest-runner-${lower(data.coder_workspace.me.owner)}-${lower(data.coder_workspace.me.name)}" workspace_pod_instance = "coder-workspace-${lower(data.coder_workspace.me.owner)}-${lower(data.coder_workspace.me.name)}" - workspace_pod_termination_grace_period_seconds = 7200 # 2 hours (cleanup timeout). + workspace_pod_termination_grace_period_seconds = 5 * 60 * 60 # 5 hours (cleanup timeout). service_account_name = "scaletest-sa" cpu = 16 memory = 64 diff --git a/scaletest/templates/scaletest-runner/scripts/cleanup.sh b/scaletest/templates/scaletest-runner/scripts/cleanup.sh index 9d2c23463249e..cf34da8297192 100755 --- a/scaletest/templates/scaletest-runner/scripts/cleanup.sh +++ b/scaletest/templates/scaletest-runner/scripts/cleanup.sh @@ -23,8 +23,8 @@ fi start_phase "Cleanup (${event})" coder exp scaletest cleanup \ - --cleanup-job-timeout 15m \ - --cleanup-timeout 2h | + --cleanup-job-timeout 2h \ + --cleanup-timeout 5h | tee "${SCALETEST_RESULTS_DIR}/cleanup-${event}.txt" end_phase diff --git a/scaletest/templates/scaletest-runner/scripts/lib.sh b/scaletest/templates/scaletest-runner/scripts/lib.sh index 0982eb01429ef..a9df0a10eba0a 100644 --- a/scaletest/templates/scaletest-runner/scripts/lib.sh +++ b/scaletest/templates/scaletest-runner/scripts/lib.sh @@ -19,6 +19,9 @@ SCALETEST_STATE_DIR="${SCALETEST_RUN_DIR}/state" SCALETEST_PHASE_FILE="${SCALETEST_STATE_DIR}/phase" # shellcheck disable=SC2034 SCALETEST_RESULTS_DIR="${SCALETEST_RUN_DIR}/results" +SCALETEST_PPROF_DIR="${SCALETEST_RUN_DIR}/pprof" + +mkdir -p "${SCALETEST_STATE_DIR}" "${SCALETEST_RESULTS_DIR}" "${SCALETEST_PPROF_DIR}" coder() { maybedryrun "${DRY_RUN}" command coder "${@}" @@ -142,9 +145,6 @@ annotate_grafana() { log "Grafana annotation added!" - if [[ ! -f "${SCALETEST_STATE_DIR}" ]]; then - mkdir -p "${SCALETEST_STATE_DIR}" - fi id="$(jq -r '.id' <<<"${resp}")" echo "${id}:${tags}:${text}:${start}" >>"${SCALETEST_STATE_DIR}/grafana-annotations" } diff --git a/scaletest/templates/scaletest-runner/scripts/prepare.sh b/scaletest/templates/scaletest-runner/scripts/prepare.sh index 2c20ace6f9cd6..055d0afc90b3b 100755 --- a/scaletest/templates/scaletest-runner/scripts/prepare.sh +++ b/scaletest/templates/scaletest-runner/scripts/prepare.sh @@ -36,10 +36,13 @@ echo -n "${CODER_URL}" >"${CODER_CONFIG_DIR}/url" set +x # Avoid logging the token. # Persist configuration for shutdown script too since the # owner token is invalidated immediately on workspace stop. -export CODER_SESSION_TOKEN=$CODER_USER_TOKEN +export CODER_SESSION_TOKEN=${CODER_USER_TOKEN} coder tokens delete scaletest_runner >/dev/null 2>&1 || true # TODO(mafredri): Set TTL? This could interfere with delayed stop though. token=$(coder tokens create --name scaletest_runner) +if [[ $DRY_RUN == 1 ]]; then + token=${CODER_SESSION_TOKEN} +fi unset CODER_SESSION_TOKEN echo -n "${token}" >"${CODER_CONFIG_DIR}/session" [[ $VERBOSE == 1 ]] && set -x # Restore logging (if enabled). diff --git a/scaletest/templates/scaletest-runner/scripts/report.sh b/scaletest/templates/scaletest-runner/scripts/report.sh index 453d4e53c6e16..68947917ab9c4 100755 --- a/scaletest/templates/scaletest-runner/scripts/report.sh +++ b/scaletest/templates/scaletest-runner/scripts/report.sh @@ -27,7 +27,11 @@ server_version="$(jq -r '.version' <<<"${buildinfo}")" server_version_commit="$(jq -r '.external_url' <<<"${buildinfo}")" # Since `coder show` doesn't support JSON output, we list the workspaces instead. -workspace_json="$(DRYRUN=0 coder list --all --output json | jq --arg workspace "${CODER_WORKSPACE}" --arg user "${CODER_USER}" 'map(select(.name == $workspace) | select(.owner_name == $user)) | .[0]')" +# Use `command` here to bypass dry run. +workspace_json="$( + command coder list --all --output json | + jq --arg workspace "${CODER_WORKSPACE}" --arg user "${CODER_USER}" 'map(select(.name == $workspace) | select(.owner_name == $user)) | .[0]' +)" owner_name="$(jq -r '.latest_build.workspace_owner_name' <<<"${workspace_json}")" workspace_name="$(jq -r '.latest_build.workspace_name' <<<"${workspace_json}")" initiator_name="$(jq -r '.latest_build.initiator_name' <<<"${workspace_json}")" @@ -43,7 +47,7 @@ while read -r app_name; do app_url="${app_url//to=now/to=$(($(date +%s) * 1000))}" bold='*' fi - app_urls+=("${bullet} ${bold}${app_name}: ${app_url}${bold}") + app_urls+=("${bullet} ${bold}${app_name}${bold}: ${app_url}") done <<<"${app_urls_raw}" params=() diff --git a/scaletest/templates/scaletest-runner/scripts/run.sh b/scaletest/templates/scaletest-runner/scripts/run.sh index 1197283f82b8d..c9053e5a6b15f 100755 --- a/scaletest/templates/scaletest-runner/scripts/run.sh +++ b/scaletest/templates/scaletest-runner/scripts/run.sh @@ -17,7 +17,7 @@ coder exp scaletest create-workspaces \ --count "${SCALETEST_PARAM_NUM_WORKSPACES}" \ --template "${SCALETEST_PARAM_TEMPLATE}" \ --concurrency "${SCALETEST_PARAM_CREATE_CONCURRENCY}" \ - --job-timeout 2h \ + --job-timeout 5h \ --no-cleanup \ --output json:"${SCALETEST_RESULTS_DIR}/create-workspaces.json" show_json "${SCALETEST_RESULTS_DIR}/create-workspaces.json" diff --git a/scaletest/templates/scaletest-runner/startup.sh b/scaletest/templates/scaletest-runner/startup.sh index e0ea9316a9be8..df0f128b1ccd9 100755 --- a/scaletest/templates/scaletest-runner/startup.sh +++ b/scaletest/templates/scaletest-runner/startup.sh @@ -23,22 +23,58 @@ fi annotate_grafana "workspace" "Agent running" # Ended in shutdown.sh. +{ + pids=() + ports=() + declare -A pods=() + next_port=6061 + for pod in $(kubectl get pods -l app.kubernetes.io/name=coder -o jsonpath='{.items[*].metadata.name}'); do + maybedryrun "${DRY_RUN}" kubectl -n coder-big port-forward "${pod}" "${next_port}:6060" & + pids+=($!) + ports+=("${next_port}") + pods[${next_port}]="${pod}" + next_port=$((next_port + 1)) + done + + trap 'trap - EXIT; kill -INT "${pids[@]}"; exit 1' INT EXIT + + while :; do + sleep 285 # ~300 when accounting for profile and trace. + log "Grabbing pprof dumps" + start="$(date +%s)" + annotate_grafana "pprof" "Grab pprof dumps (start=${start})" + for type in allocs block heap goroutine mutex 'profile?seconds=10' 'trace?seconds=5'; do + for port in "${ports[@]}"; do + tidy_type="${type//\?/_}" + tidy_type="${tidy_type//=/_}" + maybedryrun "${DRY_RUN}" curl -sSL --output "${SCALETEST_PPROF_DIR}/pprof-${tidy_type}-${pods[${port}]}-${start}.gz" "http://localhost:${port}/debug/pprof/${type}" + done + done + annotate_grafana_end "pprof" "Grab pprof dumps (start=${start})" + done +} & +pprof_pid=$! + # Show failure in the UI if script exits with error. failed_status=Failed on_exit() { + code=${?} trap - ERR EXIT + set +e + + kill -INT "${pprof_pid}" case "${SCALETEST_PARAM_CLEANUP_STRATEGY}" in on_stop) # Handled by shutdown script. ;; on_success) - if [[ $(get_status) != "${failed_status}" ]]; then + if ((code == 0)); then "${SCRIPTS_DIR}/cleanup.sh" "${SCALETEST_PARAM_CLEANUP_STRATEGY}" fi ;; on_error) - if [[ $(get_status) = "${failed_status}" ]]; then + if ((code > 0)); then "${SCRIPTS_DIR}/cleanup.sh" "${SCALETEST_PARAM_CLEANUP_STRATEGY}" fi ;; @@ -60,6 +96,8 @@ on_err() { GRAFANA_EXTRA_TAGS=error set_status "${failed_status} (exit=${code})" "${SCRIPTS_DIR}/report.sh" failed lock_status # Ensure we never rewrite the status after a failure. + + exit "${code}" } trap on_err ERR