Commit d8515f0
feat(scaletest): add grafana annotations and slack reporting (#9852)
Fixes #9575 Fixes #9576
1 parent 4e44204 commit d8515f0
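
The diff below wires three new pieces of plumbing through the scaletest-runner template: a grafana_url local (also exported as GRAFANA_URL), a GRAFANA_API_TOKEN pulled from a Kubernetes secret, and a SLACK_WEBHOOK_URL pulled from another. The scripts that consume them (posting annotations to the loadtest dashboard and run reports to Slack) live in the six changed files not shown in this excerpt. A minimal sketch of the pattern those variables enable, assuming only Grafana's documented /api/annotations endpoint and a standard Slack incoming webhook — the tag names and message text here are illustrative, not taken from the commit:

    #!/usr/bin/env bash
    set -euo pipefail

    # Annotate the loadtest dashboard with the run start.
    # Grafana's annotations API takes epoch milliseconds.
    start_ms=$(($(date +%s) * 1000))
    curl -sSf -X POST "${GRAFANA_URL}/api/annotations" \
      -H "Authorization: Bearer ${GRAFANA_API_TOKEN}" \
      -H "Content-Type: application/json" \
      -d "{\"time\": ${start_ms}, \"tags\": [\"scaletest\"], \"text\": \"Scaletest started\"}"

    # Post a run summary to Slack through the incoming webhook.
    curl -sSf -X POST "${SLACK_WEBHOOK_URL}" \
      -H "Content-Type: application/json" \
      -d '{"text": "Scaletest finished."}'
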

File tree

8 files changed: +495 −78 lines

scaletest/templates/scaletest-runner/main.tf

+192 −23
@@ -35,14 +35,18 @@ resource "null_resource" "permission_check" {
 }
 
 locals {
-  workspace_pod_name     = "coder-scaletest-runner-${lower(data.coder_workspace.me.owner)}-${lower(data.coder_workspace.me.name)}"
-  workspace_pod_instance = "coder-workspace-${lower(data.coder_workspace.me.owner)}-${lower(data.coder_workspace.me.name)}"
-  service_account_name   = "scaletest-sa"
-  cpu                    = 2
-  memory                 = 2
-  home_disk_size         = 10
-  scaletest_run_id       = "scaletest-${time_static.start_time.rfc3339}"
-  scaletest_run_dir      = "/home/coder/${local.scaletest_run_id}"
+  workspace_pod_name                             = "coder-scaletest-runner-${lower(data.coder_workspace.me.owner)}-${lower(data.coder_workspace.me.name)}"
+  workspace_pod_instance                         = "coder-workspace-${lower(data.coder_workspace.me.owner)}-${lower(data.coder_workspace.me.name)}"
+  workspace_pod_termination_grace_period_seconds = 7200 # 2 hours (cleanup timeout).
+  service_account_name                           = "scaletest-sa"
+  cpu                                            = 16
+  memory                                         = 64
+  home_disk_size                                 = 10
+  scaletest_run_id                               = "scaletest-${time_static.start_time.rfc3339}"
+  scaletest_run_dir                              = "/home/coder/${local.scaletest_run_id}"
+  grafana_url                                    = "https://stats.dev.c8s.io"
+  grafana_dashboard_uid                          = "qLVSTR-Vz"
+  grafana_dashboard_name                         = "coderv2-loadtest-dashboard"
 }
 
 data "coder_provisioner" "me" {
@@ -91,15 +95,14 @@ data "coder_parameter" "job_concurrency" {
   order       = 11
   type        = "number"
   name        = "Job concurrency"
-  default     = 10
+  default     = 0
   description = "The number of concurrent jobs (e.g. when producing workspace traffic)."
   mutable     = true
 
   # Setting zero = unlimited, but perhaps not a good idea,
   # we can raise this limit instead.
   validation {
-    min = 1
-    max = 100
+    min = 0
   }
 }
 

@@ -197,6 +200,121 @@ data "coder_parameter" "num_workspaces" {
   }
 }
 
+
+data "coder_parameter" "load_scenarios" {
+  order       = 22
+  name        = "Load Scenarios"
+  type        = "list(string)"
+  description = "The load scenarios to run."
+  mutable     = true
+  ephemeral   = true
+  default = jsonencode([
+    "SSH Traffic",
+    "Web Terminal Traffic",
+    "Dashboard Traffic",
+  ])
+}
+
+data "coder_parameter" "load_scenario_ssh_traffic_duration" {
+  order       = 23
+  name        = "SSH Traffic Duration"
+  type        = "number"
+  description = "The duration of the SSH traffic load scenario in minutes."
+  mutable     = true
+  default     = 30
+  validation {
+    min = 1
+    max = 1440 // 24 hours.
+  }
+}
+
+data "coder_parameter" "load_scenario_ssh_bytes_per_tick" {
+  order       = 24
+  name        = "SSH Bytes Per Tick"
+  type        = "number"
+  description = "The number of bytes to send per tick in the SSH traffic load scenario."
+  mutable     = true
+  default     = 1024
+  validation {
+    min = 1
+  }
+}
+
+data "coder_parameter" "load_scenario_ssh_tick_interval" {
+  order       = 25
+  name        = "SSH Tick Interval"
+  type        = "number"
+  description = "The number of milliseconds between each tick in the SSH traffic load scenario."
+  mutable     = true
+  default     = 100
+  validation {
+    min = 1
+  }
+}
+
+data "coder_parameter" "load_scenario_web_terminal_traffic_duration" {
+  order       = 26
+  name        = "Web Terminal Traffic Duration"
+  type        = "number"
+  description = "The duration of the web terminal traffic load scenario in minutes."
+  mutable     = true
+  default     = 30
+  validation {
+    min = 1
+    max = 1440 // 24 hours.
+  }
+}
+
+data "coder_parameter" "load_scenario_web_terminal_bytes_per_tick" {
+  order       = 27
+  name        = "Web Terminal Bytes Per Tick"
+  type        = "number"
+  description = "The number of bytes to send per tick in the web terminal traffic load scenario."
+  mutable     = true
+  default     = 1024
+  validation {
+    min = 1
+  }
+}
+
+data "coder_parameter" "load_scenario_web_terminal_tick_interval" {
+  order       = 28
+  name        = "Web Terminal Tick Interval"
+  type        = "number"
+  description = "The number of milliseconds between each tick in the web terminal traffic load scenario."
+  mutable     = true
+  default     = 100
+  validation {
+    min = 1
+  }
+}
+
+data "coder_parameter" "load_scenario_dashboard_traffic_duration" {
+  order       = 29
+  name        = "Dashboard Traffic Duration"
+  type        = "number"
+  description = "The duration of the dashboard traffic load scenario in minutes."
+  mutable     = true
+  default     = 30
+  validation {
+    min = 1
+    max = 1440 // 24 hours.
+  }
+}
+
+data "coder_parameter" "load_scenario_baseline_duration" {
+  order       = 26
+  name        = "Baseline Wait Duration"
+  type        = "number"
+  description = "The duration to wait before starting a load scenario in minutes."
+  mutable     = true
+  default     = 5
+  validation {
+    min = 0
+    max = 60
+  }
+}
+
 data "coder_parameter" "namespace" {
   order = 999
   type  = "string"
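
Since coder_parameter values reach the workspace as strings, the list(string) scenario selection above arrives JSON-encoded (hence the jsonencode default). Any consumer has to decode it before iterating; a hypothetical bash/jq sketch, using the SCALETEST_PARAM_LOAD_SCENARIOS variable exported by the agent env below:

    # e.g. SCALETEST_PARAM_LOAD_SCENARIOS='["SSH Traffic","Web Terminal Traffic","Dashboard Traffic"]'
    mapfile -t scenarios < <(jq -r '.[]' <<<"${SCALETEST_PARAM_LOAD_SCENARIOS}")
    for scenario in "${scenarios[@]}"; do
      echo "queued load scenario: ${scenario}"
    done
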
@@ -221,21 +339,38 @@ resource "coder_agent" "main" {
     CODER_CONFIG_DIR : "/home/coder/.config/coderv2",
     CODER_USER_TOKEN : data.coder_workspace.me.owner_session_token,
     CODER_URL : data.coder_workspace.me.access_url,
+    CODER_USER : data.coder_workspace.me.owner,
+    CODER_WORKSPACE : data.coder_workspace.me.name,
 
     # Global scaletest envs that may affect each `coder exp scaletest` invocation.
     CODER_SCALETEST_PROMETHEUS_ADDRESS : "0.0.0.0:21112",
     CODER_SCALETEST_PROMETHEUS_WAIT : "60s",
     CODER_SCALETEST_CONCURRENCY : "${data.coder_parameter.job_concurrency.value}",
     CODER_SCALETEST_CLEANUP_CONCURRENCY : "${data.coder_parameter.cleanup_concurrency.value}",
 
+    # Expose as params as well, for reporting (TODO(mafredri): refactor, only have one).
+    SCALETEST_PARAM_SCALETEST_CONCURRENCY : "${data.coder_parameter.job_concurrency.value}",
+    SCALETEST_PARAM_SCALETEST_CLEANUP_CONCURRENCY : "${data.coder_parameter.cleanup_concurrency.value}",
+
     # Local envs passed as arguments to `coder exp scaletest` invocations.
     SCALETEST_RUN_ID : local.scaletest_run_id,
     SCALETEST_RUN_DIR : local.scaletest_run_dir,
-    SCALETEST_TEMPLATE : data.coder_parameter.workspace_template.value,
-    SCALETEST_SKIP_CLEANUP : "1",
-    SCALETEST_NUM_WORKSPACES : data.coder_parameter.num_workspaces.value,
-    SCALETEST_CREATE_CONCURRENCY : "${data.coder_parameter.create_concurrency.value}",
-    SCALETEST_CLEANUP_STRATEGY : data.coder_parameter.cleanup_strategy.value,
+
+    SCALETEST_PARAM_TEMPLATE : data.coder_parameter.workspace_template.value,
+    SCALETEST_PARAM_NUM_WORKSPACES : data.coder_parameter.num_workspaces.value,
+    SCALETEST_PARAM_CREATE_CONCURRENCY : "${data.coder_parameter.create_concurrency.value}",
+    SCALETEST_PARAM_CLEANUP_STRATEGY : data.coder_parameter.cleanup_strategy.value,
+    SCALETEST_PARAM_LOAD_SCENARIOS : data.coder_parameter.load_scenarios.value,
+    SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_DURATION : "${data.coder_parameter.load_scenario_ssh_traffic_duration.value}",
+    SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_BYTES_PER_TICK : "${data.coder_parameter.load_scenario_ssh_bytes_per_tick.value}",
+    SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_TICK_INTERVAL : "${data.coder_parameter.load_scenario_ssh_tick_interval.value}",
+    SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_DURATION : "${data.coder_parameter.load_scenario_web_terminal_traffic_duration.value}",
+    SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_BYTES_PER_TICK : "${data.coder_parameter.load_scenario_web_terminal_bytes_per_tick.value}",
+    SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_TICK_INTERVAL : "${data.coder_parameter.load_scenario_web_terminal_tick_interval.value}",
+    SCALETEST_PARAM_LOAD_SCENARIO_DASHBOARD_TRAFFIC_DURATION : "${data.coder_parameter.load_scenario_dashboard_traffic_duration.value}",
+    SCALETEST_PARAM_LOAD_SCENARIO_BASELINE_DURATION : "${data.coder_parameter.load_scenario_baseline_duration.value}",
+
+    GRAFANA_URL : local.grafana_url,
 
     SCRIPTS_ZIP : filebase64(data.archive_file.scripts_zip.output_path),
     SCRIPTS_DIR : "/tmp/scripts",
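
Each SCALETEST_PARAM_LOAD_SCENARIO_* value maps onto a flag of `coder exp scaletest workspace-traffic`. The run script that performs this mapping is not part of this excerpt, so the following is a sketch rather than the commit's actual invocation; the flag spellings are assumptions to be checked against `coder exp scaletest workspace-traffic --help`:

    # Hypothetical SSH traffic scenario invocation; durations arrive in
    # minutes and tick intervals in milliseconds, per the parameter docs.
    coder exp scaletest workspace-traffic \
      --bytes-per-tick "${SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_BYTES_PER_TICK}" \
      --tick-interval "${SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_TICK_INTERVAL}ms" \
      --job-timeout "${SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_DURATION}m" \
      --ssh
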
@@ -244,12 +379,13 @@ resource "coder_agent" "main" {
     vscode     = false
     ssh_helper = false
   }
-  startup_script_timeout  = 3600
-  shutdown_script_timeout = 1800
+  startup_script_timeout  = 86400
+  shutdown_script_timeout = 7200
   startup_script_behavior = "blocking"
   startup_script          = file("startup.sh")
   shutdown_script         = file("shutdown.sh")
 
+  # IDEA(mafredri): It would be pretty cool to define metadata to expect JSON output, each field/item could become a separate metadata item.
   # Scaletest metadata.
   metadata {
     display_name = "Scaletest status"
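
The timeout bumps track the new runtime expectations: startup_script_timeout goes from 3600 to 86400 seconds (1 hour to 24 hours) so day-long load runs are not killed mid-flight, and shutdown_script_timeout goes from 1800 to 7200 seconds, matching the 2-hour cleanup window introduced above as workspace_pod_termination_grace_period_seconds.
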
@@ -332,7 +468,7 @@ resource "coder_app" "grafana" {
   agent_id     = coder_agent.main.id
   slug         = "00-grafana"
   display_name = "Grafana"
-  url          = "https://stats.dev.c8s.io/d/qLVSTR-Vz/coderv2-loadtest-dashboard?orgId=1&from=${time_static.start_time.unix * 1000}&to=now"
+  url          = "${local.grafana_url}/d/${local.grafana_dashboard_uid}/${local.grafana_dashboard_name}?orgId=1&from=${time_static.start_time.unix * 1000}&to=now"
   icon         = "https://grafana.com/static/assets/img/fav32.png"
   external     = true
 }
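
This change only extracts the hardcoded dashboard URL into the new locals. Note that Grafana's from/to query parameters take epoch milliseconds, which is why time_static.start_time.unix (seconds) is multiplied by 1000, and to=now keeps the dashboard tracking the live run.
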
@@ -409,7 +545,7 @@ resource "kubernetes_pod" "main" {
   }
   # Set the pod delete timeout to termination_grace_period_seconds + 1m.
   timeouts {
-    delete = "32m"
+    delete = "${(local.workspace_pod_termination_grace_period_seconds + 120) / 60}m"
   }
   spec {
     security_context {
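
Worked out: (7200 + 120) / 60 = 122, so the delete timeout renders as "122m" — one minute beyond the pod's termination_grace_period_seconds of 7260 seconds (121 minutes) set below, exactly the "+ 1m" the comment promises. The unit here must be minutes; a seconds suffix would yield a 122-second timeout, far shorter than the old "32m".
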
@@ -421,8 +557,9 @@ resource "kubernetes_pod" "main" {
     service_account_name = local.service_account_name
 
     # Allow the coder agent to perform graceful shutdown and cleanup of
-    # scaletest resources, 30 minutes (cleanup timeout) + 1 minute.
-    termination_grace_period_seconds = 1860
+    # scaletest resources. We add an extra minute to ensure work
+    # completion is prioritized over timeout.
+    termination_grace_period_seconds = local.workspace_pod_termination_grace_period_seconds + 60
 
     container {
       name = "dev"
@@ -440,6 +577,24 @@ resource "kubernetes_pod" "main" {
         name  = "CODER_AGENT_LOG_DIR"
         value = "${local.scaletest_run_dir}/logs"
       }
+      env {
+        name = "GRAFANA_API_TOKEN"
+        value_from {
+          secret_key_ref {
+            name = data.kubernetes_secret.grafana_editor_api_token.metadata[0].name
+            key  = "token"
+          }
+        }
+      }
+      env {
+        name = "SLACK_WEBHOOK_URL"
+        value_from {
+          secret_key_ref {
+            name = data.kubernetes_secret.slack_scaletest_notifications_webhook_url.metadata[0].name
+            key  = "url"
+          }
+        }
+      }
       resources {
         # Set requests and limits values such that we can do performant
         # execution of `coder scaletest` commands.
@@ -496,7 +651,7 @@ resource "kubernetes_pod" "main" {
           match_expressions {
            key      = "cloud.google.com/gke-nodepool"
            operator = "In"
-            values   = ["big-misc"] # Avoid placing on the same nodes as scaletest workspaces.
+            values   = ["big-workspacetraffic"] # Avoid placing on the same nodes as scaletest workspaces.
          }
        }
      }
@@ -505,6 +660,20 @@ resource "kubernetes_pod" "main" {
   }
 }
 
+data "kubernetes_secret" "grafana_editor_api_token" {
+  metadata {
+    name      = "grafana-editor-api-token"
+    namespace = data.coder_parameter.namespace.value
+  }
+}
+
+data "kubernetes_secret" "slack_scaletest_notifications_webhook_url" {
+  metadata {
+    name      = "slack-scaletest-notifications-webhook-url"
+    namespace = data.coder_parameter.namespace.value
+  }
+}
+
 resource "kubernetes_manifest" "pod_monitor" {
   count    = data.coder_workspace.me.start_count
   manifest = {
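
These data sources only read secrets that must already exist in the target namespace; nothing in the template creates them. They would be provisioned out of band along these lines — a sketch where the namespace and values are placeholders, not from the commit; the key names (token, url) match the secret_key_ref keys above:

    kubectl create secret generic grafana-editor-api-token \
      --namespace "${NAMESPACE}" \
      --from-literal=token="${GRAFANA_EDITOR_API_TOKEN}"
    kubectl create secret generic slack-scaletest-notifications-webhook-url \
      --namespace "${NAMESPACE}" \
      --from-literal=url="${SLACK_WEBHOOK_URL}"
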

scaletest/templates/scaletest-runner/scripts/cleanup.sh

+1 −1
@@ -24,7 +24,7 @@ fi
 start_phase "Cleanup (${event})"
 coder exp scaletest cleanup \
 	--cleanup-job-timeout 15m \
-	--cleanup-timeout 30m |
+	--cleanup-timeout 2h |
 	tee "${SCALETEST_RESULTS_DIR}/cleanup-${event}.txt"
 end_phase
 
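
The jump from 30m to 2h mirrors the new workspace_pod_termination_grace_period_seconds = 7200 in main.tf (commented there as the cleanup timeout), so the pod now stays alive at least as long as cleanup is permitted to run.
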
