feat(nri-resctrl-plugin): add integration tests #38
Workflow file for this run
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: benchmark

# Benchmark workflow: provisions a bare-metal EC2 runner, deploys the
# OpenTelemetry demo with a load generator, runs the unvariance collector,
# and gathers perf/pidstat/trace/parquet measurement artifacts.
on:
  workflow_dispatch: # Manual trigger for testing
    inputs:
      instance-type:
        description: 'EC2 instance type to use'
        required: false
        default: 'm7i.metal-24xl'
        type: string
      image-type:
        description: 'Image type to use (ubuntu-22.04 or ubuntu-24.04)'
        required: false
        default: 'ubuntu-22.04'
        type: string
      pidstat-period:
        description: 'Collection frequency for pidstat in seconds'
        required: false
        default: '1'
        type: string
      steady-state-minutes:
        description: 'How long to run the steady-state workload in minutes'
        required: false
        default: '1'
        type: string
      trace-duration:
        description: 'Duration for trace collection in seconds'
        required: false
        default: '10'
        type: string
      collector-repository:
        description: 'Collector image repository to use'
        required: false
        default: 'ghcr.io/unvariance/collector/collector'
        type: string
  pull_request:
    paths:
      - 'deploy/opentelemetry-demo/values.yaml'
      - '.github/workflows/benchmark.yaml'
      - '.github/actions/setup-k3s/**'
  push:
    branches:
      - main
    paths:
      - 'deploy/opentelemetry-demo/values.yaml'
      - '.github/workflows/benchmark.yaml'
      - '.github/actions/setup-k3s/**'

permissions:
  id-token: write # Required for requesting the JWT
  contents: read
  actions: write
jobs:
  # Provisions the self-hosted EC2 runner used by the measurement jobs and
  # computes the workload timeout from the requested steady-state duration.
  setup-runner:
    name: Start EC2 runner
    runs-on: ubuntu-latest
    outputs:
      runner-label: ${{ steps.start-runner.outputs.runner-label }}
      ec2-instance-id: ${{ steps.start-runner.outputs.ec2-instance-id }}
      region: ${{ steps.start-runner.outputs.region }}
      timeout-minutes: ${{ steps.calculate-timeout.outputs.timeout-minutes }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          ref: ${{ github.event.pull_request.head.sha || github.sha }}

      - name: Calculate timeout
        id: calculate-timeout
        run: |
          # Total timeout = 15 minutes fixed overhead + steady-state minutes
          STEADY_STATE_MINUTES=${{ inputs.steady-state-minutes || '1' }}
          TIMEOUT_MINUTES=$((15 + STEADY_STATE_MINUTES))
          echo "timeout-minutes=$TIMEOUT_MINUTES" >> $GITHUB_OUTPUT
          echo "Calculated timeout: $TIMEOUT_MINUTES minutes"

      - name: Start AWS Runner
        id: start-runner
        uses: ./.github/actions/aws-runner
        with:
          github-token: ${{ secrets.REPO_ADMIN_TOKEN }}
          aws-role-arn: ${{ secrets.AWS_ROLE_ARN }}
          iam-role-name: github-actions-runner
          instance-type: ${{ inputs.instance-type || 'm7i.metal-24xl' }}
          image-type: ${{ inputs.image-type || 'ubuntu-22.04' }}
          volume-size: '40'
| run-workload: | |
| needs: [setup-runner] | |
| runs-on: ${{ needs.setup-runner.outputs.runner-label }} | |
| timeout-minutes: ${{ fromJSON(needs.setup-runner.outputs.timeout-minutes) }} | |
| outputs: | |
| uuid-prefix: ${{ steps.generate-uuid.outputs.uuid }} | |
| time-diff: ${{ steps.calculate-time-diff.outputs.time-diff }} | |
| env: | |
| RELEASE_NAME: otel-demo | |
| COLLECTOR_RELEASE_NAME: collector | |
| S3_BUCKET: "unvariance-collector-test-key-auth" | |
| AWS_REGION: ${{ secrets.AWS_REGION }} | |
| HOME: /root | |
| KUBECONFIG: /etc/rancher/k3s/k3s.yaml | |
| PIDSTAT_PERIOD: ${{ inputs.pidstat-period || '1' }} | |
| STEADY_STATE_MINUTES: ${{ inputs.steady-state-minutes || '1' }} | |
| steps: | |
| - name: Create HOME directory | |
| run: | | |
| mkdir -p $HOME | |
| - name: Checkout repository | |
| uses: actions/checkout@v4 | |
| with: | |
| ref: ${{ github.event.pull_request.head.sha || github.sha }} | |
| - name: (k8s) Setup k3s cluster | |
| uses: ./.github/actions/setup-k3s | |
| with: | |
| kubeconfig_path: /etc/rancher/k3s/k3s.yaml | |
| preflight_inotify: true | |
| kubelet_max_pods: 400 | |
| disable_packaged_addons: true | |
| wait_kube_system: true | |
| timeout_api_server_ready_seconds: 300 | |
| timeout_node_ready_seconds: 300 | |
| timeout_kube_system_each_seconds: 10 | |
| max_retries_kube_system_ready: 10 | |
| - name: (k8s) Install Helm | |
| run: | | |
| curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash | |
| - name: (k8s) Add Helm Repositories | |
| run: | | |
| helm repo add unvariance https://unvariance.github.io/collector/charts | |
| helm repo add open-telemetry https://open-telemetry.github.io/opentelemetry-helm-charts | |
| helm repo update | |
| - name: (k8s) Clone forked OpenTelemetry Helm Charts | |
| run: | | |
| git clone https://github.com/yonch/opentelemetry-helm-charts.git | |
| cd opentelemetry-helm-charts | |
| echo "Using forked OpenTelemetry Helm Charts repository" | |
| git log --oneline -5 | |
| - name: (k8s) Update OpenTelemetry Demo Dependencies | |
| run: | | |
| cd opentelemetry-helm-charts/charts/opentelemetry-demo | |
| echo "Updating helm dependencies..." | |
| helm dep update | |
| #### Install KEDA Operator | |
| - name: (k8s) Install KEDA Operator | |
| run: | | |
| helm repo add kedacore https://kedacore.github.io/charts | |
| helm repo update | |
| helm install keda kedacore/keda --namespace keda-system --create-namespace | |
| #### Install OpenTelemetry Demo | |
| - name: (workload) Install OpenTelemetry Demo | |
| run: | | |
| helm install $RELEASE_NAME opentelemetry-helm-charts/charts/opentelemetry-demo -f deploy/opentelemetry-demo/values.yaml | |
| #### Install dependencies | |
| - name: (install) Compute kernel package names | |
| id: kernel | |
| run: | | |
| echo "tools=linux-tools-$(uname -r)" >> $GITHUB_OUTPUT | |
| - name: (install) Install dependencies using apt | |
| uses: awalsh128/cache-apt-pkgs-action@v1 | |
| with: | |
| packages: sysstat linux-tools-common linux-tools-generic bzip2 podman ${{ steps.kernel.outputs.tools }} | |
| version: 1.0 | |
| - name: (install) Verify perf and podman | |
| run: | | |
| perf_version=$(perf --version 2>&1) | |
| echo "Installed perf: $perf_version" | |
| podman_version=$(podman --version 2>&1) | |
| echo "Installed podman: $podman_version" | |
| - name: (install) Install awscli | |
| run: | | |
| curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" | |
| python3 -m zipfile -e awscliv2.zip . | |
| chmod u+x ./aws/install | |
| sudo ./aws/install | |
| echo ls: `ls -l /usr/local/bin/aws` || true | |
| chmod +x /usr/local/bin/aws || true | |
| echo version: `/usr/local/bin/aws --version` || true | |
| - name: (collector) Pre-fetch Collector Image | |
| run: | | |
| COLLECTOR_IMAGE="${{ inputs.collector-repository || 'ghcr.io/unvariance/collector/collector' }}" | |
| echo "Pre-fetching collector image: ${COLLECTOR_IMAGE}" | |
| podman pull "${COLLECTOR_IMAGE}" || true | |
| echo "Pre-fetch completed" | |
| #### Wait for services to be ready | |
| - name: (stabilize) Wait for all Pods to be Ready | |
| run: | | |
| if ! kubectl wait --for=condition=Ready pods --all --timeout=300s; then | |
| echo "Error: Not all pods reached Ready state within timeout" | |
| kubectl get pods | |
| echo "--------------------------------" | |
| kubectl describe pods | |
| exit 1 | |
| fi | |
| #### Show system status | |
| - name: (status) Get Default objects in kube-system | |
| run: | | |
| kubectl get all -n kube-system | |
| - name: (status) Print Events | |
| run: | | |
| kubectl get events | |
| - name: (status) Disk Space Size | |
| run: | | |
| df -h | |
| - name: (status) Print Pod Status | |
| run: | | |
| kubectl get pods -n default | |
| - name: (status) Describe Frontend Deployment | |
| run: | | |
| kubectl describe deployment frontend | |
| #### Deploy Collector | |
| - name: (collector) Generate UUID Prefix | |
| id: generate-uuid | |
| run: | | |
| UUID=$(python3 -c "import uuid; print(uuid.uuid4())") | |
| echo "Using UUID prefix: $UUID" | |
| echo "uuid=$UUID" >> $GITHUB_OUTPUT | |
| - name: (collector) Deploy Collector Helm Chart | |
| env: | |
| AWS_ACCESS_KEY_ID: ${{ secrets.S3_ACCESS_KEY_ID }} | |
| AWS_SECRET_ACCESS_KEY: ${{ secrets.S3_SECRET_ACCESS_KEY }} | |
| run: | | |
| UUID_PREFIX="${{ steps.generate-uuid.outputs.uuid }}-" | |
| # Install the helm chart using --set for values | |
| helm install ${COLLECTOR_RELEASE_NAME} unvariance/collector \ | |
| --set collector.verbose=true \ | |
| --set image.repository="${{ inputs.collector-repository || 'ghcr.io/unvariance/collector/collector' }}" \ | |
| --set storage.type=s3 \ | |
| --set storage.prefix="${UUID_PREFIX}" \ | |
| --set storage.s3.bucket="${S3_BUCKET}" \ | |
| --set storage.s3.region="${AWS_REGION}" \ | |
| --set storage.s3.auth.method=secret \ | |
| --set storage.s3.auth.accessKey="${AWS_ACCESS_KEY_ID}" \ | |
| --set storage.s3.auth.secretKey="${AWS_SECRET_ACCESS_KEY}" \ | |
| --set resources.limits.cpu=1000m \ | |
| --set resources.requests.cpu=1000m \ | |
| --set resources.limits.memory=1000Mi \ | |
| --wait | |
| - name: (collector) Wait for Collector Pods to be Ready | |
| run: | | |
| kubectl wait --for=condition=Ready pods --timeout=60s -l app.kubernetes.io/name=collector || WAIT_STATUS=$? | |
| echo "Wait exit status: $WAIT_STATUS" | |
| # Record timestamp when collector is fully ready | |
| COLLECTOR_START_TIME=$(date +%s) | |
| # Always describe pods, regardless of wait result | |
| echo "Describing collector pods:" | |
| kubectl describe pods -l app.kubernetes.io/name=collector | |
| # Only fail the job if wait failed | |
| if [ -n "$WAIT_STATUS" ] && [ "$WAIT_STATUS" -ne 0 ]; then | |
| echo "ERROR: Collector pods are not ready after timeout" | |
| exit 1 | |
| fi | |
| # Output the timestamp when collector is fully ready | |
| echo "COLLECTOR_START_TIME=${COLLECTOR_START_TIME}" >> $GITHUB_ENV | |
| echo "Collector fully ready at timestamp: ${COLLECTOR_START_TIME}" | |
| - name: (hpa) Check HPA Status After Deployment | |
| run: | | |
| echo "Checking HPA status after deployment but before load generation starts..." | |
| kubectl get hpa -o wide --all-namespaces || echo "No HPA resources found (wide output)" | |
| echo "Checking pod scaling status..." | |
| kubectl get deployment -o wide | |
| echo "Pod top..." | |
| kubectl top pods --all-namespaces || echo "No pod metrics available" | |
| - name: (collector) Show Collector Pod Status | |
| run: | | |
| kubectl get pods -l app.kubernetes.io/name=collector | |
| kubectl logs -l app.kubernetes.io/name=collector -c collector --tail=10 | |
| #### Start load generator | |
| - name: (workload) Start Load Generator | |
| id: calculate-time-diff | |
| run: | | |
| helm upgrade $RELEASE_NAME opentelemetry-helm-charts/charts/opentelemetry-demo -f deploy/opentelemetry-demo/values.yaml \ | |
| --set components.load-generator.enabled=true | |
| # Wait for load generator pods to be specifically in Ready state | |
| echo "Waiting for load generator pods to be ready..." | |
| kubectl wait --for=condition=Ready pods -l app.kubernetes.io/component=load-generator --timeout=60s || true | |
| # Record timestamp when load generator is fully ready | |
| LOAD_GEN_START_TIME=$(date +%s) | |
| # Calculate the time difference between collector and load generator start | |
| TIME_DIFF=$((LOAD_GEN_START_TIME - COLLECTOR_START_TIME)) | |
| echo "time-diff=${TIME_DIFF}" >> $GITHUB_OUTPUT | |
| echo "TIME_DIFF=${TIME_DIFF}" >> $GITHUB_ENV | |
| echo "Load generator fully ready at timestamp: ${LOAD_GEN_START_TIME}" | |
| echo "Time difference between collector and load generator ready: ${TIME_DIFF} seconds" | |
| - name: (workload) Describe Load Generator Deployment | |
| run: | | |
| kubectl describe deployment load-generator | |
| #### Start PIDs stat collection | |
| - name: (metrics) Start PIDs stat collection | |
| id: start-pidstat | |
| run: | | |
| # Create directory for metrics | |
| mkdir -p /tmp/system_metrics | |
| # Set file paths | |
| CPU_METRICS_FILE="/tmp/system_metrics/cpu_metrics.csv" | |
| MEMORY_METRICS_FILE="/tmp/system_metrics/memory_metrics.csv" | |
| # Start CPU metrics collection in background | |
| pidstat -H -u -p ALL ${PIDSTAT_PERIOD} | awk '{gsub(/^ +| +$/,""); gsub(/ +/,";"); print}' > ${CPU_METRICS_FILE} 2>&1 & | |
| CPU_PIDSTAT_PID=$! | |
| # Start memory metrics collection in background | |
| pidstat -H -r -p ALL ${PIDSTAT_PERIOD} | awk '{gsub(/^ +| +$/,""); gsub(/ +/,";"); print}' > ${MEMORY_METRICS_FILE} 2>&1 & | |
| MEMORY_PIDSTAT_PID=$! | |
| # Verify processes are running | |
| ps -p $CPU_PIDSTAT_PID | |
| ps -p $MEMORY_PIDSTAT_PID | |
| # Set environment variables for later use | |
| echo "CPU_METRICS_FILE=${CPU_METRICS_FILE}" >> $GITHUB_ENV | |
| echo "MEMORY_METRICS_FILE=${MEMORY_METRICS_FILE}" >> $GITHUB_ENV | |
| echo "CPU_PIDSTAT_PID=${CPU_PIDSTAT_PID}" >> $GITHUB_ENV | |
| echo "MEMORY_PIDSTAT_PID=${MEMORY_PIDSTAT_PID}" >> $GITHUB_ENV | |
| echo "Started PIDs stat collection:" | |
| echo "CPU metrics PID: ${CPU_PIDSTAT_PID}" | |
| echo "Memory metrics PID: ${MEMORY_PIDSTAT_PID}" | |
| echo "CPU metrics file: ${CPU_METRICS_FILE}" | |
| echo "Memory metrics file: ${MEMORY_METRICS_FILE}" | |
| #### Main run time | |
| - name: (metrics) Ramp up load generator | |
| run: | | |
| echo "Ramping up load generator without perf record..." | |
| sleep 200 | |
| - name: (hpa) Check HPA Status Before Measurements | |
| run: | | |
| echo "Checking HPA status just before measurements with perf..." | |
| kubectl get hpa -o wide --all-namespaces || echo "No HPA resources found (wide output)" | |
| echo "Checking pod scaling status..." | |
| kubectl get deployment -o wide | |
| echo "Pod top..." | |
| kubectl top pods --all-namespaces || echo "No pod metrics available" | |
| - name: (trace) Collect Trace Data | |
| run: | | |
| # Create directory for trace output | |
| mkdir -p /tmp/trace_data | |
| TRACE_DURATION="${{ inputs.trace-duration || '10' }}" | |
| COLLECTOR_IMAGE="${{ inputs.collector-repository || 'ghcr.io/unvariance/collector/collector' }}" | |
| echo "Starting trace collection for ${TRACE_DURATION} seconds..." | |
| # Run collector in trace mode using podman with sudo for BPF access | |
| sudo podman run --rm \ | |
| --privileged \ | |
| --network host \ | |
| --pid host \ | |
| --volume /tmp/trace_data:/tmp/trace_data \ | |
| --volume /sys:/sys \ | |
| --volume /proc:/proc:ro \ | |
| --volume /dev:/dev:ro \ | |
| --user 0 \ | |
| "${COLLECTOR_IMAGE}" \ | |
| --trace \ | |
| --prefix "/tmp/trace_data/trace-" \ | |
| --duration "${TRACE_DURATION}" \ | |
| --parquet-file-size 4000000000 \ | |
| --storage-type local | |
| echo "Trace collection completed" | |
| # List trace files | |
| echo "Generated trace files:" | |
| ls -la /tmp/trace_data/ | |
| - name: (metrics) Record system performance with perf | |
| run: | | |
| # Create directory for perf data | |
| mkdir -p /tmp/perf_data | |
| STEADY_STATE_MINUTES=${STEADY_STATE_MINUTES} | |
| PERF_RECORD_SECONDS=60 | |
| # Calculate the sleep duration (all minutes except the last one) | |
| if [[ $STEADY_STATE_MINUTES -gt 1 ]]; then | |
| SLEEP_MINUTES=$((STEADY_STATE_MINUTES - 1)) | |
| SLEEP_SECONDS=$((SLEEP_MINUTES * 60)) | |
| echo "Sleeping for $SLEEP_MINUTES minutes ($SLEEP_SECONDS seconds) before perf recording..." | |
| sleep $SLEEP_SECONDS | |
| fi | |
| echo "Starting perf record for $PERF_RECORD_SECONDS seconds at 19Hz frequency..." | |
| perf record -a -g -F 19 --call-graph dwarf -o /tmp/perf_data/perf.data sleep $PERF_RECORD_SECONDS | |
| # Create directory for perf results | |
| mkdir -p perf_results | |
| # List the perf files | |
| echo "Perf data files:" | |
| ls -la /tmp/perf_data/ | |
| - name: (metrics) Stop PIDs stat collection | |
| run: | | |
| echo "Stopping PIDs stat collection processes" | |
| kill -TERM $CPU_PIDSTAT_PID || true | |
| kill -TERM $MEMORY_PIDSTAT_PID || true | |
| # Wait a moment to ensure files are fully written | |
| sleep 2 | |
| # Check if files exist and have content | |
| echo "CPU metrics file size: $(stat -c %s $CPU_METRICS_FILE || echo 'file not found')" | |
| echo "Memory metrics file size: $(stat -c %s $MEMORY_METRICS_FILE || echo 'file not found')" | |
| # Create directory for system metrics | |
| mkdir -p system_metrics | |
| # Copy the files to the upload directory | |
| cp $CPU_METRICS_FILE system_metrics/cpu_metrics.csv || echo "Failed to copy CPU metrics file" | |
| cp $MEMORY_METRICS_FILE system_metrics/memory_metrics.csv || echo "Failed to copy memory metrics file" | |
| # List the files | |
| echo "System metrics files:" | |
| ls -la system_metrics/ | |
| - name: (metrics) Extract Locust CSV files | |
| run: | | |
| # Create directory for the results | |
| mkdir -p locust_results | |
| # Get the load-generator pod name | |
| LOADGEN_POD=$(kubectl get pods -l app.kubernetes.io/component=load-generator -o name | cut -d/ -f2) | |
| echo "Load Generator Pod: $LOADGEN_POD" | |
| # Copy CSV files from the pod | |
| echo "Copying CSV files from pod..." | |
| kubectl cp $LOADGEN_POD:/tmp/locust_results/ ./locust_results/ | |
| # List the extracted files | |
| echo "Extracted CSV files:" | |
| ls -la locust_results/ | |
| - name: (metrics) Upload Locust Results as Artifacts | |
| uses: actions/upload-artifact@v4 | |
| with: | |
| name: locust-csv-results | |
| path: locust_results/ | |
| retention-days: 28 | |
| - name: (metrics) Upload System Metrics as Artifacts | |
| uses: actions/upload-artifact@v4 | |
| with: | |
| name: system-metrics-results | |
| path: system_metrics/ | |
| retention-days: 28 | |
| - name: (collector) Send SIGINT to Collector | |
| run: | | |
| kubectl exec -n default $(kubectl get pods -l app.kubernetes.io/name=collector -o name | cut -d/ -f2) -- /bin/sh -c "kill -INT 1" | |
| sleep 10 | |
| - name: (collector) Print Collector Logs | |
| run: | | |
| kubectl logs -l app.kubernetes.io/name=collector || true | |
| - name: (collector) Uninstall Collector Helm Chart | |
| run: | | |
| # see the logs while uninstalling | |
| kubectl logs -f --tail=-1 -l app.kubernetes.io/name=collector || true & | |
| LOGS_PID=$! | |
| # wait for the pods to be deleted, in the background | |
| kubectl wait --for=delete pods -l app.kubernetes.io/name=collector --timeout=45s || true & | |
| WAIT_PID=$! | |
| # uninstall the chart | |
| helm uninstall ${COLLECTOR_RELEASE_NAME} | |
| # wait for the pods to be deleted | |
| wait $WAIT_PID || true | |
| # kill the logs | |
| kill -TERM $LOGS_PID || true | |
| - name: (collector) Get Collector Parquet File | |
| env: | |
| AWS_ACCESS_KEY_ID: ${{ secrets.S3_ACCESS_KEY_ID }} | |
| AWS_SECRET_ACCESS_KEY: ${{ secrets.S3_SECRET_ACCESS_KEY }} | |
| run: | | |
| # Get UUID prefix from the output | |
| UUID_PREFIX="${{ steps.generate-uuid.outputs.uuid }}-" | |
| # List files with the UUID prefix | |
| echo "Checking for files with prefix ${UUID_PREFIX} in S3 bucket ${S3_BUCKET}" | |
| S3_FILES=$(aws s3 ls "s3://${S3_BUCKET}/${UUID_PREFIX}" --recursive || echo "") | |
| if [ -z "$S3_FILES" ]; then | |
| echo "No files found with prefix ${UUID_PREFIX} in bucket ${S3_BUCKET}" | |
| exit 1 | |
| else | |
| echo "Found files with prefix ${UUID_PREFIX}:" | |
| echo "$S3_FILES" | |
| # Get the file path | |
| PARQUET_FILE=$(echo "$S3_FILES" | head -n 1 | awk '{print $4}') | |
| # Download the parquet file for validation | |
| aws s3 cp "s3://${S3_BUCKET}/${PARQUET_FILE}" /tmp/collector-parquet.parquet | |
| # Check file size | |
| FILE_SIZE=$(stat -c %s /tmp/collector-parquet.parquet) | |
| echo "Downloaded collector file size: ${FILE_SIZE} bytes" | |
| fi | |
| - name: (collector) Upload Collector Results as Artifacts | |
| uses: actions/upload-artifact@v4 | |
| with: | |
| name: collector-parquet-results | |
| path: /tmp/collector-parquet.parquet | |
| retention-days: 28 | |
| - name: (trace) Upload Trace Results as Artifacts | |
| uses: actions/upload-artifact@v4 | |
| with: | |
| name: trace-parquet-results | |
| path: /tmp/trace_data/ | |
| retention-days: 28 | |
| if: always() | |
| - name: (status) Print Events | |
| run: | | |
| kubectl get events | |
| - name: (status) Print Pod Status | |
| run: | | |
| kubectl get pods -n default | |
| - name: (hpa) Check HPA Status At End of Benchmark | |
| run: | | |
| echo "Checking HPA status at the end of the benchmark run..." | |
| kubectl get hpa -o wide --all-namespaces || echo "No HPA resources found (wide output)" | |
| echo "Final deployment scaling status..." | |
| kubectl get deployment -o wide | |
| echo "Pod top..." | |
| kubectl top pods --all-namespaces || echo "No pod metrics available" | |
| - name: (opentelemetry-collector) Collect Internal Metrics File | |
| run: | | |
| # Create directory for internal metrics | |
| mkdir -p otel_internal_metrics | |
| # Get the OpenTelemetry collector pod name | |
| OTEL_POD=$(kubectl get pods -l app.kubernetes.io/name=otel-collector -o name | cut -d/ -f2) | |
| echo "OpenTelemetry Collector Pod: $OTEL_POD" | |
| # Check if the internal metrics file exists and copy it | |
| echo "Checking for internal metrics file..." | |
| if kubectl exec $OTEL_POD -- test -f /tmp/internal-metrics/otel-collector-internal-metrics.jsonl; then | |
| echo "Internal metrics file found, copying..." | |
| kubectl cp $OTEL_POD:/tmp/internal-metrics/otel-collector-internal-metrics.jsonl ./otel_internal_metrics/otel-collector-internal-metrics.jsonl | |
| # Check file size and show first few lines | |
| if [ -f ./otel_internal_metrics/otel-collector-internal-metrics.jsonl ]; then | |
| FILE_SIZE=$(stat -c %s ./otel_internal_metrics/otel-collector-internal-metrics.jsonl) | |
| echo "Internal metrics file size: ${FILE_SIZE} bytes" | |
| echo "First 5 lines of internal metrics file:" | |
| head -n 5 ./otel_internal_metrics/otel-collector-internal-metrics.jsonl || true | |
| else | |
| echo "Failed to copy internal metrics file" | |
| fi | |
| else | |
| echo "Internal metrics file not found in pod" | |
| kubectl exec $OTEL_POD -- ls -la /tmp/internal-metrics/ || echo "Directory /tmp/internal-metrics/ not found" | |
| fi | |
| # List the extracted files | |
| echo "Internal metrics files:" | |
| ls -la otel_internal_metrics/ || echo "No internal metrics files found" | |
| - name: (opentelemetry-collector) Upload Internal Metrics as Artifacts | |
| uses: actions/upload-artifact@v4 | |
| with: | |
| name: otel-internal-metrics | |
| path: otel_internal_metrics/ | |
| retention-days: 28 | |
| if: always() | |
| generate-flamegraphs: | |
| name: Generate Flamegraphs | |
| needs: [setup-runner, run-workload] | |
| runs-on: ${{ needs.setup-runner.outputs.runner-label }} | |
| timeout-minutes: 10 | |
| env: | |
| HOME: /root | |
| steps: | |
| - name: Create HOME directory | |
| run: | | |
| mkdir -p $HOME | |
| - name: Checkout repository | |
| uses: actions/checkout@v4 | |
| with: | |
| ref: ${{ github.event.pull_request.head.sha || github.sha }} | |
| - name: (install) Install build tools | |
| uses: awalsh128/cache-apt-pkgs-action@v1 | |
| with: | |
| packages: build-essential | |
| version: 1.0 | |
| - name: (install) Install Rust and addr2line | |
| run: | | |
| # Install Rust | |
| curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y | |
| source "$HOME/.cargo/env" | |
| # Verify Rust installation | |
| rustc --version | |
| cargo --version | |
| # Install addr2line from crates.io | |
| cargo install addr2line --features="bin" | |
| # Check if the system addr2line exists and back it up if it does | |
| if [ -f /usr/bin/addr2line ]; then | |
| sudo mv /usr/bin/addr2line /usr/bin/addr2line.bak | |
| echo "Backed up original addr2line to /usr/bin/addr2line.bak" | |
| fi | |
| # Copy the Rust addr2line to replace the system version | |
| sudo cp "$HOME/.cargo/bin/addr2line" /usr/bin/addr2line | |
| sudo chmod +x /usr/bin/addr2line | |
| # Verify the replacement | |
| addr2line --version | |
| echo "Replaced system addr2line with Rust implementation" | |
| # Install Inferno for flamegraph generation | |
| cargo install inferno | |
| # Verify Inferno installation | |
| inferno-flamegraph --help | |
| - name: (metrics) Generate Perf Report and Flamegraphs | |
| run: | | |
| # so we can use the inferno binaries | |
| source "$HOME/.cargo/env" | |
| mkdir -p flamegraph_results perf_results | |
| # Generate a report summary in text format | |
| echo "Generating perf report - this may take some time..." | |
| perf report --stdio -i /tmp/perf_data/perf.data > perf_results/perf_report.txt || true | |
| # Generate collapsed stack traces (folded) | |
| echo "Generating folded stacks from perf data..." | |
| perf script -i /tmp/perf_data/perf.data | inferno-collapse-perf > flamegraph_results/stacks.folded | |
| # Generate flamegraphs with different color schemes | |
| echo "Generating flamegraphs from folded stacks..." | |
| cat flamegraph_results/stacks.folded | inferno-flamegraph > flamegraph_results/flamegraph.svg | |
| # List the perf and flamegraph files | |
| echo "Perf results:" | |
| ls -la perf_results/ | |
| echo "Flamegraph results:" | |
| ls -la flamegraph_results/ | |
| - name: (metrics) Upload Perf and Flamegraph Results as Artifacts | |
| uses: actions/upload-artifact@v4 | |
| with: | |
| name: performance-analysis | |
| path: | | |
| perf_results/ | |
| flamegraph_results/ | |
| retention-days: 28 | |
| generate-visualizations: | |
| name: Generate Performance Visualizations | |
| needs: [run-workload] | |
| runs-on: ubuntu-latest | |
| env: | |
| STEADY_STATE_MINUTES: ${{ inputs.steady-state-minutes || '1' }} | |
| TIME_DIFF: ${{ needs.run-workload.outputs.time-diff || '0' }} | |
| container: | |
| image: rocker/tidyverse:latest | |
| steps: | |
| - name: Checkout repository | |
| uses: actions/checkout@v4 | |
| with: | |
| ref: ${{ github.event.pull_request.head.sha || github.sha }} | |
| - name: Install R Dependencies | |
| run: | | |
| # Install only the additional packages needed (tidyverse is already included in base image) | |
| R -e "install.packages(c('scales', 'nanoparquet', 'gridExtra', 'stringr'), repos='https://cloud.r-project.org/')" | |
| - name: Download System Metrics Artifacts | |
| uses: actions/download-artifact@v4 | |
| with: | |
| name: system-metrics-results | |
| path: system_metrics | |
| - name: Download Locust Results | |
| uses: actions/download-artifact@v4 | |
| with: | |
| name: locust-csv-results | |
| path: locust_results | |
| - name: Download Collector Parquet | |
| uses: actions/download-artifact@v4 | |
| with: | |
| name: collector-parquet-results | |
| path: collector_data | |
| - name: List Downloaded Files | |
| run: | | |
| echo "System Metrics Files:" | |
| ls -la system_metrics/ | |
| echo "Locust Results:" | |
| ls -la locust_results/ | |
| echo "Collector Data:" | |
| ls -la collector_data/ | |
| - name: Convert CPU Metrics Format | |
| run: | | |
| # Create directory for converted metrics | |
| mkdir -p converted_metrics | |
| # Convert CPU metrics | |
| bash scripts/convert_cpu_metrics.sh system_metrics/cpu_metrics.csv converted_metrics/cpu_metrics.csv | |
| # Check output | |
| echo "Converted CPU Metrics:" | |
| head -n 5 converted_metrics/cpu_metrics.csv | |
| - name: Generate Memory Utilization Plots | |
| run: | | |
| mkdir -p visualization_results | |
| # Generate memory utilization plots for collector process | |
| Rscript scripts/plot_memory_utilization.R system_metrics/memory_metrics.csv collector visualization_results/memory_utilization | |
| # Check output | |
| ls -la visualization_results/ | |
| - name: Generate CPU Utilization Plots | |
| run: | | |
| # Generate CPU utilization plots | |
| Rscript scripts/plot_cpu_utilization.R converted_metrics/cpu_metrics.csv collector visualization_results/cpu_utilization | |
| # Check output | |
| ls -la visualization_results/ | |
| - name: Generate Workload Performance Plots | |
| run: | | |
| # Find the stats_history file | |
| STATS_FILE=$(find locust_results -name "*stats_history.csv" | head -1) | |
| if [ -n "$STATS_FILE" ]; then | |
| echo "Using Locust stats file: $STATS_FILE" | |
| # Generate workload performance plots | |
| Rscript scripts/plot_workload_performance.R $STATS_FILE visualization_results/workload_performance | |
| else | |
| echo "No stats_history.csv file found in locust_results directory" | |
| ls -la locust_results/ | |
| exit 1 | |
| fi | |
| # Check output | |
| ls -la visualization_results/ | |
| - name: Generate CPI by LLC Misses Plots | |
| run: | | |
| # Calculate the time window based on steady state minutes and the time difference between | |
| # when collector is ready and when load generator is ready | |
| TIME_DIFF=${TIME_DIFF:-0} | |
| # Start: 200 seconds after load generator is ready (which is 200+TIME_DIFF seconds after collector is ready) | |
| # End: End 5 seconds before the steady state ends | |
| START_TIME=$((205 + TIME_DIFF)) | |
| STEADY_STATE_SECONDS=$((${STEADY_STATE_MINUTES} * 60)) | |
| END_TIME=$((200 + STEADY_STATE_SECONDS - 5 + TIME_DIFF)) | |
| # Calculate an alternative end time capped at 180 seconds from start | |
| END_TIME_CAPPED=$((START_TIME + 180)) | |
| # Use the earlier of the two end times for the main plot | |
| if [ "$END_TIME_CAPPED" -lt "$END_TIME" ]; then | |
| echo "Using capped end time (180s from start): $START_TIME - $END_TIME_CAPPED seconds (steady state: ${STEADY_STATE_MINUTES} min)" | |
| echo "Time difference between collector and load generator ready states: ${TIME_DIFF} seconds" | |
| # Generate an additional plot with the original end time | |
| Rscript scripts/plot_cpi_by_llc_misses.R collector_data/collector-parquet.parquet 100000 visualization_results/cpi_by_llc_misses_capped 23 $START_TIME $END_TIME_CAPPED | |
| else | |
| echo "No need to cap the length: using steady state end time" | |
| fi | |
| echo "Using time window: $START_TIME - $END_TIME seconds (steady state: ${STEADY_STATE_MINUTES} min)" | |
| echo "Time difference between collector and load generator ready states: ${TIME_DIFF} seconds" | |
| # Generate CPI by LLC misses plots with time window parameters | |
| Rscript scripts/plot_cpi_by_llc_misses.R collector_data/collector-parquet.parquet 100000 visualization_results/cpi_by_llc_misses 23 $START_TIME $END_TIME || true | |
| - name: Generate Contention Analysis Plots | |
| run: | | |
| # Calculate the time window similar to CPI by LLC Misses plots but capped at 60 seconds | |
| TIME_DIFF=${TIME_DIFF:-0} | |
| # Start: 200 seconds after load generator is ready (which is 200+TIME_DIFF seconds after collector is ready) | |
| # End: End 5 seconds before the steady state ends | |
| START_TIME=$((205 + TIME_DIFF)) | |
| STEADY_STATE_SECONDS=$((${STEADY_STATE_MINUTES} * 60)) | |
| END_TIME=$((200 + STEADY_STATE_SECONDS - 5 + TIME_DIFF)) | |
| # Cap the window duration at 60 seconds | |
| WINDOW_DURATION=$((END_TIME - START_TIME)) | |
| if [ "$WINDOW_DURATION" -gt 60 ]; then | |
| WINDOW_DURATION=60 | |
| END_TIME=$((START_TIME + 60)) | |
| fi | |
| echo "Using time window for contention analysis: $START_TIME - $END_TIME seconds (${WINDOW_DURATION}s duration, steady state: ${STEADY_STATE_MINUTES} min)" | |
| echo "Time difference between collector and load generator ready states: ${TIME_DIFF} seconds" | |
| # Generate contention analysis plots with calculated time window | |
| Rscript scripts/plot_contention_analysis.R collector_data/collector-parquet.parquet $WINDOW_DURATION visualization_results/contention_analysis 18 0.2 $END_TIME || true | |
| # Check output | |
| ls -la visualization_results/contention_analysis_* || echo "No contention analysis plots generated" | |
| - name: Generate Instructions vs CPI Plots | |
| run: | | |
| # Calculate the time window similar to CPI by LLC Misses plots but capped at 60 seconds | |
| TIME_DIFF=${TIME_DIFF:-0} | |
| # Start: 200 seconds after load generator is ready (which is 200+TIME_DIFF seconds after collector is ready) | |
| # End: End 5 seconds before the steady state ends | |
| START_TIME=$((205 + TIME_DIFF)) | |
| STEADY_STATE_SECONDS=$((${STEADY_STATE_MINUTES} * 60)) | |
| END_TIME=$((200 + STEADY_STATE_SECONDS - 5 + TIME_DIFF)) | |
| # Cap the window duration at 60 seconds | |
| WINDOW_DURATION=$((END_TIME - START_TIME)) | |
| if [ "$WINDOW_DURATION" -gt 60 ]; then | |
| WINDOW_DURATION=60 | |
| END_TIME=$((START_TIME + 60)) | |
| fi | |
| echo "Using time window for instructions vs CPI: $START_TIME - $END_TIME seconds (${WINDOW_DURATION}s duration, steady state: ${STEADY_STATE_MINUTES} min)" | |
| echo "Time difference between collector and load generator ready states: ${TIME_DIFF} seconds" | |
| # Generate Instructions vs CPI scatter plot with calculated time window | |
| Rscript scripts/plot_instructions_vs_cpi.R collector_data/collector-parquet.parquet $WINDOW_DURATION visualization_results/instructions_vs_cpi_steady_state 18 $END_TIME || true | |
| # Check output | |
| ls -la visualization_results/instructions_vs_cpi_* || echo "No instructions vs CPI plots generated" | |
      # Publish every plot produced by the preceding visualization steps as a
      # single downloadable artifact, kept for 28 days.
      - name: Upload Visualization Results
        uses: actions/upload-artifact@v4
        with:
          name: performance-visualizations
          path: visualization_results/
          retention-days: 28
| analyze-parquet: | |
| name: Analyze Parquet Data | |
| needs: [run-workload] | |
| runs-on: ubuntu-latest | |
| strategy: | |
| matrix: | |
| data-type: [timeslot, trace] | |
| steps: | |
| - name: Download parquet artifact | |
| uses: actions/download-artifact@v4 | |
| with: | |
| name: ${{ matrix.data-type == 'timeslot' && 'collector-parquet-results' || 'trace-parquet-results' }} | |
| path: parquet-data | |
| - name: Install pqrs | |
| run: | | |
| curl -L -o pqrs.zip https://github.com/manojkarthick/pqrs/releases/download/v0.3.2/pqrs-0.3.2-x86_64-unknown-linux-gnu.zip | |
| python3 -m zipfile -e pqrs.zip . | |
| sudo mv pqrs-0.3.2-x86_64-unknown-linux-gnu/bin/pqrs /usr/local/bin/ | |
| sudo chmod +x /usr/local/bin/pqrs | |
| rm -rf pqrs.zip pqrs-0.3.2-x86_64-unknown-linux-gnu | |
| pqrs --version | |
| - name: Analyze parquet file | |
| run: | | |
| mkdir -p parquet-analysis | |
| PARQUET_FILE=$(find parquet-data -name "*.parquet" -type f | head -1) | |
| echo "Found parquet file: $PARQUET_FILE" | |
| if [ -z "$PARQUET_FILE" ]; then | |
| echo "No parquet files found for ${{ matrix.data-type }} data" | |
| exit 0 | |
| fi | |
| # Generate simple schema (non-detailed) | |
| echo "Generating simple schema..." | |
| pqrs schema $PARQUET_FILE > parquet-analysis/schema.txt | |
| # Generate detailed schema | |
| echo "Generating detailed schema..." | |
| pqrs schema --detailed $PARQUET_FILE > parquet-analysis/schema-detailed.txt | |
| # Generate sample records (100) | |
| echo "Generating 100 sample records..." | |
| pqrs sample --records 100 $PARQUET_FILE > parquet-analysis/sample-100.txt | |
| # Generate head records (first 100) | |
| echo "Generating first 100 records..." | |
| pqrs head --records 100 $PARQUET_FILE > parquet-analysis/head-100.txt | |
| # Generate JSON versions too | |
| echo "Generating JSON formatted outputs..." | |
| pqrs schema --json $PARQUET_FILE > parquet-analysis/schema.json | |
| pqrs sample --json --records 100 $PARQUET_FILE > parquet-analysis/sample-100.json | |
| pqrs head --json --records 100 $PARQUET_FILE > parquet-analysis/head-100.json | |
| # List the generated files | |
| echo "Generated analysis files:" | |
| ls -la parquet-analysis/ | |
| - name: Upload parquet analysis results | |
| uses: actions/upload-artifact@v4 | |
| with: | |
| name: parquet-analysis-${{ matrix.data-type }}-results | |
| path: parquet-analysis/ | |
| retention-days: 28 | |
  # Render memory-usage plots (at several bin widths) from the collector's
  # parquet output, inside an R container with tidyverse preinstalled.
  generate-memory-usage-plots:
    name: Generate Memory Usage Plots
    needs: [run-workload]
    runs-on: ubuntu-latest
    env:
      # Seconds between collector-ready and load-generator-ready, exported by
      # run-workload; defaults to '0' when the output is empty.
      TIME_DIFF: ${{ needs.run-workload.outputs.time-diff || '0' }}
    container:
      image: rocker/tidyverse:latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          # Check out the PR head when triggered by pull_request, the pushed
          # commit otherwise.
          ref: ${{ github.event.pull_request.head.sha || github.sha }}
      - name: Install R Dependencies
        run: |
          # Install only the additional packages needed (tidyverse is already included in base image)
          R -e "install.packages(c('scales', 'nanoparquet', 'gridExtra', 'stringr'), repos='https://cloud.r-project.org/')"
      - name: Download Collector Parquet
        uses: actions/download-artifact@v4
        with:
          name: collector-parquet-results
          path: collector_data
      - name: Generate Memory Usage Plots
        run: |
          mkdir -p memory_usage_results
          # The collector is ready TIME_DIFF seconds before the load generator is ready
          TIME_DIFF=${TIME_DIFF:-0}
          # Calculate adjusted start times relative to when collector is ready
          START_TIME_1=$((20 + TIME_DIFF))
          START_TIME_2=$((180 + TIME_DIFF))
          echo "Using adjusted start times: $START_TIME_1 and $START_TIME_2 seconds after collector is ready"
          echo "Time difference between collector and load generator ready states: ${TIME_DIFF} seconds"
          # Generate memory usage plots (LLC misses and cache references) with the adjusted start times.
          # One plot per (start offset, bin width in seconds) pair; failures are
          # tolerated ('|| true') so remaining combinations still run.
          for i in 0.3 0.5 1 3 10; do
            Rscript scripts/plot_memory_usage.R collector_data/collector-parquet.parquet $START_TIME_1 $i memory_usage_results/memory_usage_20sec_$i || true
            Rscript scripts/plot_memory_usage.R collector_data/collector-parquet.parquet $START_TIME_2 $i memory_usage_results/memory_usage_180sec_$i || true
          done
          # Check output
          ls -la memory_usage_results/
      - name: Upload Memory Usage Plots
        uses: actions/upload-artifact@v4
        with:
          name: memory-usage-plots
          path: memory_usage_results/
          retention-days: 28
  # Terminate the self-hosted EC2 runner started by setup-runner. Runs even
  # when upstream jobs fail so the instance is never leaked.
  stop-runner:
    name: Stop EC2 runner
    # NOTE(review): analyze-parquet and generate-memory-usage-plots are not in
    # 'needs' — presumably safe because they run on GitHub-hosted runners, not
    # the EC2 instance; confirm before stopping the runner earlier.
    needs: [setup-runner, run-workload, generate-flamegraphs] # Now depends on the flamegraph job
    runs-on: ubuntu-latest
    if: always() # Run even if previous jobs fail
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          ref: ${{ github.event.pull_request.head.sha || github.sha }}
      - name: Stop AWS Runner
        uses: ./.github/actions/aws-runner/cleanup
        with:
          runner-label: ${{ needs.setup-runner.outputs.runner-label }}
          ec2-instance-id: ${{ needs.setup-runner.outputs.ec2-instance-id }}
          github-token: ${{ secrets.REPO_ADMIN_TOKEN }}
          aws-role-arn: ${{ secrets.AWS_ROLE_ARN }}
          # NOTE(review): setup-runner's visible outputs are only runner-label
          # and ec2-instance-id, so 'outputs.region' may always be empty and
          # the secret fallback always taken — verify the job actually exports
          # a 'region' output.
          aws-region: ${{ needs.setup-runner.outputs.region || secrets.AWS_REGION }}