# Thanks to visit codestin.com
# Credit goes to github.com
#
# Skip to content
#
# feat(nri-resctrl-plugin): add integration tests #38
#
# Workflow file for this run
# Benchmark workflow: provisions a bare-metal EC2 runner, runs the
# OpenTelemetry demo workload alongside the unvariance collector, and
# gathers perf/pidstat/trace data plus visualizations.
name: benchmark

on:
  workflow_dispatch: # Manual trigger for testing
    inputs:
      instance-type:
        description: 'EC2 instance type to use'
        required: false
        default: 'm7i.metal-24xl'
        type: string
      image-type:
        description: 'Image type to use (ubuntu-22.04 or ubuntu-24.04)'
        required: false
        default: 'ubuntu-22.04'
        type: string
      pidstat-period:
        description: 'Collection frequency for pidstat in seconds'
        required: false
        default: '1'
        type: string
      steady-state-minutes:
        description: 'How long to run the steady-state workload in minutes'
        required: false
        default: '1'
        type: string
      trace-duration:
        description: 'Duration for trace collection in seconds'
        required: false
        default: '10'
        type: string
      collector-repository:
        description: 'Collector image repository to use'
        required: false
        default: 'ghcr.io/unvariance/collector/collector'
        type: string
  # Also run automatically when the benchmark inputs themselves change.
  pull_request:
    paths:
      - deploy/opentelemetry-demo/values.yaml
      - .github/workflows/benchmark.yaml
      - '.github/actions/setup-k3s/**'
  push:
    branches:
      - main
    paths:
      - deploy/opentelemetry-demo/values.yaml
      - .github/workflows/benchmark.yaml
      - '.github/actions/setup-k3s/**'

permissions:
  id-token: write # Required for requesting the JWT
  contents: read
  actions: write
jobs:
  # Provisions the self-hosted EC2 runner that the workload jobs run on,
  # and computes the job timeout from the requested steady-state duration.
  setup-runner:
    name: Start EC2 runner
    runs-on: ubuntu-latest
    outputs:
      runner-label: ${{ steps.start-runner.outputs.runner-label }}
      ec2-instance-id: ${{ steps.start-runner.outputs.ec2-instance-id }}
      region: ${{ steps.start-runner.outputs.region }}
      timeout-minutes: ${{ steps.calculate-timeout.outputs.timeout-minutes }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          ref: ${{ github.event.pull_request.head.sha || github.sha }}
      - name: Calculate timeout
        id: calculate-timeout
        # 15 minutes of fixed setup/teardown overhead plus the steady-state time.
        run: |
          STEADY_STATE_MINUTES=${{ inputs.steady-state-minutes || '1' }}
          TIMEOUT_MINUTES=$((15 + STEADY_STATE_MINUTES))
          echo "timeout-minutes=$TIMEOUT_MINUTES" >> $GITHUB_OUTPUT
          echo "Calculated timeout: $TIMEOUT_MINUTES minutes"
      - name: Start AWS Runner
        id: start-runner
        uses: ./.github/actions/aws-runner
        with:
          github-token: ${{ secrets.REPO_ADMIN_TOKEN }}
          aws-role-arn: ${{ secrets.AWS_ROLE_ARN }}
          iam-role-name: github-actions-runner
          instance-type: ${{ inputs.instance-type || 'm7i.metal-24xl' }}
          image-type: ${{ inputs.image-type || 'ubuntu-22.04' }}
          volume-size: '40'

  # Main benchmark job: sets up k3s, deploys the demo workload and the
  # collector, drives load, and collects perf/pidstat/trace artifacts.
  run-workload:
    needs: [setup-runner]
    runs-on: ${{ needs.setup-runner.outputs.runner-label }}
    timeout-minutes: ${{ fromJSON(needs.setup-runner.outputs.timeout-minutes) }}
    outputs:
      uuid-prefix: ${{ steps.generate-uuid.outputs.uuid }}
      time-diff: ${{ steps.calculate-time-diff.outputs.time-diff }}
    env:
      RELEASE_NAME: otel-demo
      COLLECTOR_RELEASE_NAME: collector
      S3_BUCKET: "unvariance-collector-test-key-auth"
      AWS_REGION: ${{ secrets.AWS_REGION }}
      HOME: /root
      KUBECONFIG: /etc/rancher/k3s/k3s.yaml
      PIDSTAT_PERIOD: ${{ inputs.pidstat-period || '1' }}
      STEADY_STATE_MINUTES: ${{ inputs.steady-state-minutes || '1' }}
    steps:
      - name: Create HOME directory
        run: |
          mkdir -p $HOME
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          ref: ${{ github.event.pull_request.head.sha || github.sha }}
      - name: (k8s) Setup k3s cluster
        uses: ./.github/actions/setup-k3s
        with:
          kubeconfig_path: /etc/rancher/k3s/k3s.yaml
          preflight_inotify: true
          kubelet_max_pods: 400
          disable_packaged_addons: true
          wait_kube_system: true
          timeout_api_server_ready_seconds: 300
          timeout_node_ready_seconds: 300
          timeout_kube_system_each_seconds: 10
          max_retries_kube_system_ready: 10
      - name: (k8s) Install Helm
        run: |
          curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
      - name: (k8s) Add Helm Repositories
        run: |
          helm repo add unvariance https://unvariance.github.io/collector/charts
          helm repo add open-telemetry https://open-telemetry.github.io/opentelemetry-helm-charts
          helm repo update
      - name: (k8s) Clone forked OpenTelemetry Helm Charts
        run: |
          git clone https://github.com/yonch/opentelemetry-helm-charts.git
          cd opentelemetry-helm-charts
          echo "Using forked OpenTelemetry Helm Charts repository"
          git log --oneline -5
      - name: (k8s) Update OpenTelemetry Demo Dependencies
        run: |
          cd opentelemetry-helm-charts/charts/opentelemetry-demo
          echo "Updating helm dependencies..."
          helm dep update
      #### Install KEDA Operator
      - name: (k8s) Install KEDA Operator
        run: |
          helm repo add kedacore https://kedacore.github.io/charts
          helm repo update
          helm install keda kedacore/keda --namespace keda-system --create-namespace
      #### Install OpenTelemetry Demo
      - name: (workload) Install OpenTelemetry Demo
        run: |
          helm install $RELEASE_NAME opentelemetry-helm-charts/charts/opentelemetry-demo -f deploy/opentelemetry-demo/values.yaml
      #### Install dependencies
      - name: (install) Compute kernel package names
        id: kernel
        run: |
          echo "tools=linux-tools-$(uname -r)" >> $GITHUB_OUTPUT
      - name: (install) Install dependencies using apt
        uses: awalsh128/cache-apt-pkgs-action@v1
        with:
          packages: sysstat linux-tools-common linux-tools-generic bzip2 podman ${{ steps.kernel.outputs.tools }}
          version: "1.0"
      - name: (install) Verify perf and podman
        run: |
          perf_version=$(perf --version 2>&1)
          echo "Installed perf: $perf_version"
          podman_version=$(podman --version 2>&1)
          echo "Installed podman: $podman_version"
      - name: (install) Install awscli
        run: |
          curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"
          python3 -m zipfile -e awscliv2.zip .
          chmod u+x ./aws/install
          sudo ./aws/install
          echo ls: `ls -l /usr/local/bin/aws` || true
          chmod +x /usr/local/bin/aws || true
          echo version: `/usr/local/bin/aws --version` || true
      - name: (collector) Pre-fetch Collector Image
        run: |
          COLLECTOR_IMAGE="${{ inputs.collector-repository || 'ghcr.io/unvariance/collector/collector' }}"
          echo "Pre-fetching collector image: ${COLLECTOR_IMAGE}"
          podman pull "${COLLECTOR_IMAGE}" || true
          echo "Pre-fetch completed"
      #### Wait for services to be ready
      - name: (stabilize) Wait for all Pods to be Ready
        run: |
          if ! kubectl wait --for=condition=Ready pods --all --timeout=300s; then
            echo "Error: Not all pods reached Ready state within timeout"
            kubectl get pods
            echo "--------------------------------"
            kubectl describe pods
            exit 1
          fi
      #### Show system status
      - name: (status) Get Default objects in kube-system
        run: |
          kubectl get all -n kube-system
      - name: (status) Print Events
        run: |
          kubectl get events
      - name: (status) Disk Space Size
        run: |
          df -h
      - name: (status) Print Pod Status
        run: |
          kubectl get pods -n default
      - name: (status) Describe Frontend Deployment
        run: |
          kubectl describe deployment frontend
      #### Deploy Collector
      - name: (collector) Generate UUID Prefix
        id: generate-uuid
        # Unique S3 key prefix so this run's parquet output is isolated.
        run: |
          UUID=$(python3 -c "import uuid; print(uuid.uuid4())")
          echo "Using UUID prefix: $UUID"
          echo "uuid=$UUID" >> $GITHUB_OUTPUT
      - name: (collector) Deploy Collector Helm Chart
        env:
          AWS_ACCESS_KEY_ID: ${{ secrets.S3_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.S3_SECRET_ACCESS_KEY }}
        run: |
          UUID_PREFIX="${{ steps.generate-uuid.outputs.uuid }}-"
          # Install the helm chart using --set for values
          helm install ${COLLECTOR_RELEASE_NAME} unvariance/collector \
            --set collector.verbose=true \
            --set image.repository="${{ inputs.collector-repository || 'ghcr.io/unvariance/collector/collector' }}" \
            --set storage.type=s3 \
            --set storage.prefix="${UUID_PREFIX}" \
            --set storage.s3.bucket="${S3_BUCKET}" \
            --set storage.s3.region="${AWS_REGION}" \
            --set storage.s3.auth.method=secret \
            --set storage.s3.auth.accessKey="${AWS_ACCESS_KEY_ID}" \
            --set storage.s3.auth.secretKey="${AWS_SECRET_ACCESS_KEY}" \
            --set resources.limits.cpu=1000m \
            --set resources.requests.cpu=1000m \
            --set resources.limits.memory=1000Mi \
            --wait
      - name: (collector) Wait for Collector Pods to be Ready
        run: |
          kubectl wait --for=condition=Ready pods --timeout=60s -l app.kubernetes.io/name=collector || WAIT_STATUS=$?
          echo "Wait exit status: $WAIT_STATUS"
          # Record timestamp when collector is fully ready
          COLLECTOR_START_TIME=$(date +%s)
          # Always describe pods, regardless of wait result
          echo "Describing collector pods:"
          kubectl describe pods -l app.kubernetes.io/name=collector
          # Only fail the job if wait failed
          if [ -n "$WAIT_STATUS" ] && [ "$WAIT_STATUS" -ne 0 ]; then
            echo "ERROR: Collector pods are not ready after timeout"
            exit 1
          fi
          # Output the timestamp when collector is fully ready
          echo "COLLECTOR_START_TIME=${COLLECTOR_START_TIME}" >> $GITHUB_ENV
          echo "Collector fully ready at timestamp: ${COLLECTOR_START_TIME}"
      - name: (hpa) Check HPA Status After Deployment
        run: |
          echo "Checking HPA status after deployment but before load generation starts..."
          kubectl get hpa -o wide --all-namespaces || echo "No HPA resources found (wide output)"
          echo "Checking pod scaling status..."
          kubectl get deployment -o wide
          echo "Pod top..."
          kubectl top pods --all-namespaces || echo "No pod metrics available"
      - name: (collector) Show Collector Pod Status
        run: |
          kubectl get pods -l app.kubernetes.io/name=collector
          kubectl logs -l app.kubernetes.io/name=collector -c collector --tail=10
      #### Start load generator
      - name: (workload) Start Load Generator
        id: calculate-time-diff
        # Enables the load generator and records the offset (TIME_DIFF)
        # between collector-ready and load-generator-ready timestamps;
        # downstream plot jobs use this offset to align time windows.
        run: |
          helm upgrade $RELEASE_NAME opentelemetry-helm-charts/charts/opentelemetry-demo -f deploy/opentelemetry-demo/values.yaml \
            --set components.load-generator.enabled=true
          # Wait for load generator pods to be specifically in Ready state
          echo "Waiting for load generator pods to be ready..."
          kubectl wait --for=condition=Ready pods -l app.kubernetes.io/component=load-generator --timeout=60s || true
          # Record timestamp when load generator is fully ready
          LOAD_GEN_START_TIME=$(date +%s)
          # Calculate the time difference between collector and load generator start
          TIME_DIFF=$((LOAD_GEN_START_TIME - COLLECTOR_START_TIME))
          echo "time-diff=${TIME_DIFF}" >> $GITHUB_OUTPUT
          echo "TIME_DIFF=${TIME_DIFF}" >> $GITHUB_ENV
          echo "Load generator fully ready at timestamp: ${LOAD_GEN_START_TIME}"
          echo "Time difference between collector and load generator ready: ${TIME_DIFF} seconds"
      - name: (workload) Describe Load Generator Deployment
        run: |
          kubectl describe deployment load-generator
      #### Start PIDs stat collection
      - name: (metrics) Start PIDs stat collection
        id: start-pidstat
        run: |
          # Create directory for metrics
          mkdir -p /tmp/system_metrics
          # Set file paths
          CPU_METRICS_FILE="/tmp/system_metrics/cpu_metrics.csv"
          MEMORY_METRICS_FILE="/tmp/system_metrics/memory_metrics.csv"
          # Start CPU metrics collection in background
          pidstat -H -u -p ALL ${PIDSTAT_PERIOD} | awk '{gsub(/^ +| +$/,""); gsub(/ +/,";"); print}' > ${CPU_METRICS_FILE} 2>&1 &
          CPU_PIDSTAT_PID=$!
          # Start memory metrics collection in background
          pidstat -H -r -p ALL ${PIDSTAT_PERIOD} | awk '{gsub(/^ +| +$/,""); gsub(/ +/,";"); print}' > ${MEMORY_METRICS_FILE} 2>&1 &
          MEMORY_PIDSTAT_PID=$!
          # Verify processes are running
          ps -p $CPU_PIDSTAT_PID
          ps -p $MEMORY_PIDSTAT_PID
          # Set environment variables for later use
          echo "CPU_METRICS_FILE=${CPU_METRICS_FILE}" >> $GITHUB_ENV
          echo "MEMORY_METRICS_FILE=${MEMORY_METRICS_FILE}" >> $GITHUB_ENV
          echo "CPU_PIDSTAT_PID=${CPU_PIDSTAT_PID}" >> $GITHUB_ENV
          echo "MEMORY_PIDSTAT_PID=${MEMORY_PIDSTAT_PID}" >> $GITHUB_ENV
          echo "Started PIDs stat collection:"
          echo "CPU metrics PID: ${CPU_PIDSTAT_PID}"
          echo "Memory metrics PID: ${MEMORY_PIDSTAT_PID}"
          echo "CPU metrics file: ${CPU_METRICS_FILE}"
          echo "Memory metrics file: ${MEMORY_METRICS_FILE}"
      #### Main run time
      - name: (metrics) Ramp up load generator
        run: |
          echo "Ramping up load generator without perf record..."
          sleep 200
      - name: (hpa) Check HPA Status Before Measurements
        run: |
          echo "Checking HPA status just before measurements with perf..."
          kubectl get hpa -o wide --all-namespaces || echo "No HPA resources found (wide output)"
          echo "Checking pod scaling status..."
          kubectl get deployment -o wide
          echo "Pod top..."
          kubectl top pods --all-namespaces || echo "No pod metrics available"
      - name: (trace) Collect Trace Data
        run: |
          # Create directory for trace output
          mkdir -p /tmp/trace_data
          TRACE_DURATION="${{ inputs.trace-duration || '10' }}"
          COLLECTOR_IMAGE="${{ inputs.collector-repository || 'ghcr.io/unvariance/collector/collector' }}"
          echo "Starting trace collection for ${TRACE_DURATION} seconds..."
          # Run collector in trace mode using podman with sudo for BPF access
          sudo podman run --rm \
            --privileged \
            --network host \
            --pid host \
            --volume /tmp/trace_data:/tmp/trace_data \
            --volume /sys:/sys \
            --volume /proc:/proc:ro \
            --volume /dev:/dev:ro \
            --user 0 \
            "${COLLECTOR_IMAGE}" \
            --trace \
            --prefix "/tmp/trace_data/trace-" \
            --duration "${TRACE_DURATION}" \
            --parquet-file-size 4000000000 \
            --storage-type local
          echo "Trace collection completed"
          # List trace files
          echo "Generated trace files:"
          ls -la /tmp/trace_data/
      - name: (metrics) Record system performance with perf
        run: |
          # Create directory for perf data
          mkdir -p /tmp/perf_data
          STEADY_STATE_MINUTES=${STEADY_STATE_MINUTES}
          PERF_RECORD_SECONDS=60
          # Calculate the sleep duration (all minutes except the last one)
          if [[ $STEADY_STATE_MINUTES -gt 1 ]]; then
            SLEEP_MINUTES=$((STEADY_STATE_MINUTES - 1))
            SLEEP_SECONDS=$((SLEEP_MINUTES * 60))
            echo "Sleeping for $SLEEP_MINUTES minutes ($SLEEP_SECONDS seconds) before perf recording..."
            sleep $SLEEP_SECONDS
          fi
          echo "Starting perf record for $PERF_RECORD_SECONDS seconds at 19Hz frequency..."
          perf record -a -g -F 19 --call-graph dwarf -o /tmp/perf_data/perf.data sleep $PERF_RECORD_SECONDS
          # Create directory for perf results
          mkdir -p perf_results
          # List the perf files
          echo "Perf data files:"
          ls -la /tmp/perf_data/
      - name: (metrics) Stop PIDs stat collection
        run: |
          echo "Stopping PIDs stat collection processes"
          kill -TERM $CPU_PIDSTAT_PID || true
          kill -TERM $MEMORY_PIDSTAT_PID || true
          # Wait a moment to ensure files are fully written
          sleep 2
          # Check if files exist and have content
          echo "CPU metrics file size: $(stat -c %s $CPU_METRICS_FILE || echo 'file not found')"
          echo "Memory metrics file size: $(stat -c %s $MEMORY_METRICS_FILE || echo 'file not found')"
          # Create directory for system metrics
          mkdir -p system_metrics
          # Copy the files to the upload directory
          cp $CPU_METRICS_FILE system_metrics/cpu_metrics.csv || echo "Failed to copy CPU metrics file"
          cp $MEMORY_METRICS_FILE system_metrics/memory_metrics.csv || echo "Failed to copy memory metrics file"
          # List the files
          echo "System metrics files:"
          ls -la system_metrics/
      - name: (metrics) Extract Locust CSV files
        run: |
          # Create directory for the results
          mkdir -p locust_results
          # Get the load-generator pod name
          LOADGEN_POD=$(kubectl get pods -l app.kubernetes.io/component=load-generator -o name | cut -d/ -f2)
          echo "Load Generator Pod: $LOADGEN_POD"
          # Copy CSV files from the pod
          echo "Copying CSV files from pod..."
          kubectl cp $LOADGEN_POD:/tmp/locust_results/ ./locust_results/
          # List the extracted files
          echo "Extracted CSV files:"
          ls -la locust_results/
      - name: (metrics) Upload Locust Results as Artifacts
        uses: actions/upload-artifact@v4
        with:
          name: locust-csv-results
          path: locust_results/
          retention-days: 28
      - name: (metrics) Upload System Metrics as Artifacts
        uses: actions/upload-artifact@v4
        with:
          name: system-metrics-results
          path: system_metrics/
          retention-days: 28
      - name: (collector) Send SIGINT to Collector
        # Graceful shutdown so the collector flushes its parquet output to S3.
        run: |
          kubectl exec -n default $(kubectl get pods -l app.kubernetes.io/name=collector -o name | cut -d/ -f2) -- /bin/sh -c "kill -INT 1"
          sleep 10
      - name: (collector) Print Collector Logs
        run: |
          kubectl logs -l app.kubernetes.io/name=collector || true
      - name: (collector) Uninstall Collector Helm Chart
        run: |
          # see the logs while uninstalling
          kubectl logs -f --tail=-1 -l app.kubernetes.io/name=collector || true &
          LOGS_PID=$!
          # wait for the pods to be deleted, in the background
          kubectl wait --for=delete pods -l app.kubernetes.io/name=collector --timeout=45s || true &
          WAIT_PID=$!
          # uninstall the chart
          helm uninstall ${COLLECTOR_RELEASE_NAME}
          # wait for the pods to be deleted
          wait $WAIT_PID || true
          # kill the logs
          kill -TERM $LOGS_PID || true
      - name: (collector) Get Collector Parquet File
        env:
          AWS_ACCESS_KEY_ID: ${{ secrets.S3_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.S3_SECRET_ACCESS_KEY }}
        run: |
          # Get UUID prefix from the output
          UUID_PREFIX="${{ steps.generate-uuid.outputs.uuid }}-"
          # List files with the UUID prefix
          echo "Checking for files with prefix ${UUID_PREFIX} in S3 bucket ${S3_BUCKET}"
          S3_FILES=$(aws s3 ls "s3://${S3_BUCKET}/${UUID_PREFIX}" --recursive || echo "")
          if [ -z "$S3_FILES" ]; then
            echo "No files found with prefix ${UUID_PREFIX} in bucket ${S3_BUCKET}"
            exit 1
          else
            echo "Found files with prefix ${UUID_PREFIX}:"
            echo "$S3_FILES"
            # Get the file path
            PARQUET_FILE=$(echo "$S3_FILES" | head -n 1 | awk '{print $4}')
            # Download the parquet file for validation
            aws s3 cp "s3://${S3_BUCKET}/${PARQUET_FILE}" /tmp/collector-parquet.parquet
            # Check file size
            FILE_SIZE=$(stat -c %s /tmp/collector-parquet.parquet)
            echo "Downloaded collector file size: ${FILE_SIZE} bytes"
          fi
      - name: (collector) Upload Collector Results as Artifacts
        uses: actions/upload-artifact@v4
        with:
          name: collector-parquet-results
          path: /tmp/collector-parquet.parquet
          retention-days: 28
      - name: (trace) Upload Trace Results as Artifacts
        uses: actions/upload-artifact@v4
        with:
          name: trace-parquet-results
          path: /tmp/trace_data/
          retention-days: 28
        if: always()
      - name: (status) Print Events
        run: |
          kubectl get events
      - name: (status) Print Pod Status
        run: |
          kubectl get pods -n default
      - name: (hpa) Check HPA Status At End of Benchmark
        run: |
          echo "Checking HPA status at the end of the benchmark run..."
          kubectl get hpa -o wide --all-namespaces || echo "No HPA resources found (wide output)"
          echo "Final deployment scaling status..."
          kubectl get deployment -o wide
          echo "Pod top..."
          kubectl top pods --all-namespaces || echo "No pod metrics available"
      - name: (opentelemetry-collector) Collect Internal Metrics File
        run: |
          # Create directory for internal metrics
          mkdir -p otel_internal_metrics
          # Get the OpenTelemetry collector pod name
          OTEL_POD=$(kubectl get pods -l app.kubernetes.io/name=otel-collector -o name | cut -d/ -f2)
          echo "OpenTelemetry Collector Pod: $OTEL_POD"
          # Check if the internal metrics file exists and copy it
          echo "Checking for internal metrics file..."
          if kubectl exec $OTEL_POD -- test -f /tmp/internal-metrics/otel-collector-internal-metrics.jsonl; then
            echo "Internal metrics file found, copying..."
            kubectl cp $OTEL_POD:/tmp/internal-metrics/otel-collector-internal-metrics.jsonl ./otel_internal_metrics/otel-collector-internal-metrics.jsonl
            # Check file size and show first few lines
            if [ -f ./otel_internal_metrics/otel-collector-internal-metrics.jsonl ]; then
              FILE_SIZE=$(stat -c %s ./otel_internal_metrics/otel-collector-internal-metrics.jsonl)
              echo "Internal metrics file size: ${FILE_SIZE} bytes"
              echo "First 5 lines of internal metrics file:"
              head -n 5 ./otel_internal_metrics/otel-collector-internal-metrics.jsonl || true
            else
              echo "Failed to copy internal metrics file"
            fi
          else
            echo "Internal metrics file not found in pod"
            kubectl exec $OTEL_POD -- ls -la /tmp/internal-metrics/ || echo "Directory /tmp/internal-metrics/ not found"
          fi
          # List the extracted files
          echo "Internal metrics files:"
          ls -la otel_internal_metrics/ || echo "No internal metrics files found"
      - name: (opentelemetry-collector) Upload Internal Metrics as Artifacts
        uses: actions/upload-artifact@v4
        with:
          name: otel-internal-metrics
          path: otel_internal_metrics/
          retention-days: 28
        if: always()

  # Runs on the EC2 runner after the workload: converts perf.data into a
  # text report and flamegraph SVG using Rust addr2line + inferno.
  generate-flamegraphs:
    name: Generate Flamegraphs
    needs: [setup-runner, run-workload]
    runs-on: ${{ needs.setup-runner.outputs.runner-label }}
    timeout-minutes: 10
    env:
      HOME: /root
    steps:
      - name: Create HOME directory
        run: |
          mkdir -p $HOME
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          ref: ${{ github.event.pull_request.head.sha || github.sha }}
      - name: (install) Install build tools
        uses: awalsh128/cache-apt-pkgs-action@v1
        with:
          packages: build-essential
          version: "1.0"
      - name: (install) Install Rust and addr2line
        run: |
          # Install Rust
          curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
          source "$HOME/.cargo/env"
          # Verify Rust installation
          rustc --version
          cargo --version
          # Install addr2line from crates.io
          cargo install addr2line --features="bin"
          # Check if the system addr2line exists and back it up if it does
          if [ -f /usr/bin/addr2line ]; then
            sudo mv /usr/bin/addr2line /usr/bin/addr2line.bak
            echo "Backed up original addr2line to /usr/bin/addr2line.bak"
          fi
          # Copy the Rust addr2line to replace the system version
          sudo cp "$HOME/.cargo/bin/addr2line" /usr/bin/addr2line
          sudo chmod +x /usr/bin/addr2line
          # Verify the replacement
          addr2line --version
          echo "Replaced system addr2line with Rust implementation"
          # Install Inferno for flamegraph generation
          cargo install inferno
          # Verify Inferno installation
          inferno-flamegraph --help
      - name: (metrics) Generate Perf Report and Flamegraphs
        run: |
          # so we can use the inferno binaries
          source "$HOME/.cargo/env"
          mkdir -p flamegraph_results perf_results
          # Generate a report summary in text format
          echo "Generating perf report - this may take some time..."
          perf report --stdio -i /tmp/perf_data/perf.data > perf_results/perf_report.txt || true
          # Generate collapsed stack traces (folded)
          echo "Generating folded stacks from perf data..."
          perf script -i /tmp/perf_data/perf.data | inferno-collapse-perf > flamegraph_results/stacks.folded
          # Generate flamegraphs with different color schemes
          echo "Generating flamegraphs from folded stacks..."
          cat flamegraph_results/stacks.folded | inferno-flamegraph > flamegraph_results/flamegraph.svg
          # List the perf and flamegraph files
          echo "Perf results:"
          ls -la perf_results/
          echo "Flamegraph results:"
          ls -la flamegraph_results/
      - name: (metrics) Upload Perf and Flamegraph Results as Artifacts
        uses: actions/upload-artifact@v4
        with:
          name: performance-analysis
          path: |
            perf_results/
            flamegraph_results/
          retention-days: 28

  # Runs on a hosted runner inside an R container: plots CPU/memory,
  # workload performance, and collector parquet analyses from artifacts.
  generate-visualizations:
    name: Generate Performance Visualizations
    needs: [run-workload]
    runs-on: ubuntu-latest
    env:
      STEADY_STATE_MINUTES: ${{ inputs.steady-state-minutes || '1' }}
      TIME_DIFF: ${{ needs.run-workload.outputs.time-diff || '0' }}
    container:
      image: rocker/tidyverse:latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          ref: ${{ github.event.pull_request.head.sha || github.sha }}
      - name: Install R Dependencies
        run: |
          # Install only the additional packages needed (tidyverse is already included in base image)
          R -e "install.packages(c('scales', 'nanoparquet', 'gridExtra', 'stringr'), repos='https://cloud.r-project.org/')"
      - name: Download System Metrics Artifacts
        uses: actions/download-artifact@v4
        with:
          name: system-metrics-results
          path: system_metrics
      - name: Download Locust Results
        uses: actions/download-artifact@v4
        with:
          name: locust-csv-results
          path: locust_results
      - name: Download Collector Parquet
        uses: actions/download-artifact@v4
        with:
          name: collector-parquet-results
          path: collector_data
      - name: List Downloaded Files
        run: |
          echo "System Metrics Files:"
          ls -la system_metrics/
          echo "Locust Results:"
          ls -la locust_results/
          echo "Collector Data:"
          ls -la collector_data/
      - name: Convert CPU Metrics Format
        run: |
          # Create directory for converted metrics
          mkdir -p converted_metrics
          # Convert CPU metrics
          bash scripts/convert_cpu_metrics.sh system_metrics/cpu_metrics.csv converted_metrics/cpu_metrics.csv
          # Check output
          echo "Converted CPU Metrics:"
          head -n 5 converted_metrics/cpu_metrics.csv
      - name: Generate Memory Utilization Plots
        run: |
          mkdir -p visualization_results
          # Generate memory utilization plots for collector process
          Rscript scripts/plot_memory_utilization.R system_metrics/memory_metrics.csv collector visualization_results/memory_utilization
          # Check output
          ls -la visualization_results/
      - name: Generate CPU Utilization Plots
        run: |
          # Generate CPU utilization plots
          Rscript scripts/plot_cpu_utilization.R converted_metrics/cpu_metrics.csv collector visualization_results/cpu_utilization
          # Check output
          ls -la visualization_results/
      - name: Generate Workload Performance Plots
        run: |
          # Find the stats_history file
          STATS_FILE=$(find locust_results -name "*stats_history.csv" | head -1)
          if [ -n "$STATS_FILE" ]; then
            echo "Using Locust stats file: $STATS_FILE"
            # Generate workload performance plots
            Rscript scripts/plot_workload_performance.R $STATS_FILE visualization_results/workload_performance
          else
            echo "No stats_history.csv file found in locust_results directory"
            ls -la locust_results/
            exit 1
          fi
          # Check output
          ls -la visualization_results/
      - name: Generate CPI by LLC Misses Plots
        run: |
          # Calculate the time window based on steady state minutes and the time difference between
          # when collector is ready and when load generator is ready
          TIME_DIFF=${TIME_DIFF:-0}
          # Start: 200 seconds after load generator is ready (which is 200+TIME_DIFF seconds after collector is ready)
          # End: End 5 seconds before the steady state ends
          START_TIME=$((205 + TIME_DIFF))
          STEADY_STATE_SECONDS=$((${STEADY_STATE_MINUTES} * 60))
          END_TIME=$((200 + STEADY_STATE_SECONDS - 5 + TIME_DIFF))
          # Calculate an alternative end time capped at 180 seconds from start
          END_TIME_CAPPED=$((START_TIME + 180))
          # Use the earlier of the two end times for the main plot
          if [ "$END_TIME_CAPPED" -lt "$END_TIME" ]; then
            echo "Using capped end time (180s from start): $START_TIME - $END_TIME_CAPPED seconds (steady state: ${STEADY_STATE_MINUTES} min)"
            echo "Time difference between collector and load generator ready states: ${TIME_DIFF} seconds"
            # Generate an additional plot with the original end time
            Rscript scripts/plot_cpi_by_llc_misses.R collector_data/collector-parquet.parquet 100000 visualization_results/cpi_by_llc_misses_capped 23 $START_TIME $END_TIME_CAPPED
          else
            echo "No need to cap the length: using steady state end time"
          fi
          echo "Using time window: $START_TIME - $END_TIME seconds (steady state: ${STEADY_STATE_MINUTES} min)"
          echo "Time difference between collector and load generator ready states: ${TIME_DIFF} seconds"
          # Generate CPI by LLC misses plots with time window parameters
          Rscript scripts/plot_cpi_by_llc_misses.R collector_data/collector-parquet.parquet 100000 visualization_results/cpi_by_llc_misses 23 $START_TIME $END_TIME || true
      - name: Generate Contention Analysis Plots
        run: |
          # Calculate the time window similar to CPI by LLC Misses plots but capped at 60 seconds
          TIME_DIFF=${TIME_DIFF:-0}
          # Start: 200 seconds after load generator is ready (which is 200+TIME_DIFF seconds after collector is ready)
          # End: End 5 seconds before the steady state ends
          START_TIME=$((205 + TIME_DIFF))
          STEADY_STATE_SECONDS=$((${STEADY_STATE_MINUTES} * 60))
          END_TIME=$((200 + STEADY_STATE_SECONDS - 5 + TIME_DIFF))
          # Cap the window duration at 60 seconds
          WINDOW_DURATION=$((END_TIME - START_TIME))
          if [ "$WINDOW_DURATION" -gt 60 ]; then
            WINDOW_DURATION=60
            END_TIME=$((START_TIME + 60))
          fi
          echo "Using time window for contention analysis: $START_TIME - $END_TIME seconds (${WINDOW_DURATION}s duration, steady state: ${STEADY_STATE_MINUTES} min)"
          echo "Time difference between collector and load generator ready states: ${TIME_DIFF} seconds"
          # Generate contention analysis plots with calculated time window
          Rscript scripts/plot_contention_analysis.R collector_data/collector-parquet.parquet $WINDOW_DURATION visualization_results/contention_analysis 18 0.2 $END_TIME || true
          # Check output
          ls -la visualization_results/contention_analysis_* || echo "No contention analysis plots generated"
      - name: Generate Instructions vs CPI Plots
        run: |
          # Calculate the time window similar to CPI by LLC Misses plots but capped at 60 seconds
          TIME_DIFF=${TIME_DIFF:-0}
          # Start: 200 seconds after load generator is ready (which is 200+TIME_DIFF seconds after collector is ready)
          # End: End 5 seconds before the steady state ends
          START_TIME=$((205 + TIME_DIFF))
          STEADY_STATE_SECONDS=$((${STEADY_STATE_MINUTES} * 60))
          END_TIME=$((200 + STEADY_STATE_SECONDS - 5 + TIME_DIFF))
          # Cap the window duration at 60 seconds
          WINDOW_DURATION=$((END_TIME - START_TIME))
          if [ "$WINDOW_DURATION" -gt 60 ]; then
            WINDOW_DURATION=60
            END_TIME=$((START_TIME + 60))
          fi
          echo "Using time window for instructions vs CPI: $START_TIME - $END_TIME seconds (${WINDOW_DURATION}s duration, steady state: ${STEADY_STATE_MINUTES} min)"
          echo "Time difference between collector and load generator ready states: ${TIME_DIFF} seconds"
          # Generate Instructions vs CPI scatter plot with calculated time window
          Rscript scripts/plot_instructions_vs_cpi.R collector_data/collector-parquet.parquet $WINDOW_DURATION visualization_results/instructions_vs_cpi_steady_state 18 $END_TIME || true
          # Check output
          ls -la visualization_results/instructions_vs_cpi_* || echo "No instructions vs CPI plots generated"
      - name: Upload Visualization Results
        uses: actions/upload-artifact@v4
        with:
          name: performance-visualizations
          path: visualization_results/
          retention-days: 28

  # Dumps schema and sample records from both parquet artifacts using pqrs.
  analyze-parquet:
    name: Analyze Parquet Data
    needs: [run-workload]
    runs-on: ubuntu-latest
    strategy:
      matrix:
        data-type: [timeslot, trace]
    steps:
      - name: Download parquet artifact
        uses: actions/download-artifact@v4
        with:
          name: ${{ matrix.data-type == 'timeslot' && 'collector-parquet-results' || 'trace-parquet-results' }}
          path: parquet-data
      - name: Install pqrs
        run: |
          curl -L -o pqrs.zip https://github.com/manojkarthick/pqrs/releases/download/v0.3.2/pqrs-0.3.2-x86_64-unknown-linux-gnu.zip
          python3 -m zipfile -e pqrs.zip .
          sudo mv pqrs-0.3.2-x86_64-unknown-linux-gnu/bin/pqrs /usr/local/bin/
          sudo chmod +x /usr/local/bin/pqrs
          rm -rf pqrs.zip pqrs-0.3.2-x86_64-unknown-linux-gnu
          pqrs --version
      - name: Analyze parquet file
        run: |
          mkdir -p parquet-analysis
          PARQUET_FILE=$(find parquet-data -name "*.parquet" -type f | head -1)
          echo "Found parquet file: $PARQUET_FILE"
          if [ -z "$PARQUET_FILE" ]; then
            echo "No parquet files found for ${{ matrix.data-type }} data"
            exit 0
          fi
          # Generate simple schema (non-detailed)
          echo "Generating simple schema..."
          pqrs schema $PARQUET_FILE > parquet-analysis/schema.txt
          # Generate detailed schema
          echo "Generating detailed schema..."
          pqrs schema --detailed $PARQUET_FILE > parquet-analysis/schema-detailed.txt
          # Generate sample records (100)
          echo "Generating 100 sample records..."
          pqrs sample --records 100 $PARQUET_FILE > parquet-analysis/sample-100.txt
          # Generate head records (first 100)
          echo "Generating first 100 records..."
          pqrs head --records 100 $PARQUET_FILE > parquet-analysis/head-100.txt
          # Generate JSON versions too
          echo "Generating JSON formatted outputs..."
          pqrs schema --json $PARQUET_FILE > parquet-analysis/schema.json
          pqrs sample --json --records 100 $PARQUET_FILE > parquet-analysis/sample-100.json
          pqrs head --json --records 100 $PARQUET_FILE > parquet-analysis/head-100.json
          # List the generated files
          echo "Generated analysis files:"
          ls -la parquet-analysis/
      - name: Upload parquet analysis results
        uses: actions/upload-artifact@v4
        with:
          name: parquet-analysis-${{ matrix.data-type }}-results
          path: parquet-analysis/
          retention-days: 28

  # Plots LLC-miss/cache-reference memory usage from the collector parquet
  # at several window sizes, in an R container on a hosted runner.
  generate-memory-usage-plots:
    name: Generate Memory Usage Plots
    needs: [run-workload]
    runs-on: ubuntu-latest
    env:
      TIME_DIFF: ${{ needs.run-workload.outputs.time-diff || '0' }}
    container:
      image: rocker/tidyverse:latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          ref: ${{ github.event.pull_request.head.sha || github.sha }}
      - name: Install R Dependencies
        run: |
          # Install only the additional packages needed (tidyverse is already included in base image)
          R -e "install.packages(c('scales', 'nanoparquet', 'gridExtra', 'stringr'), repos='https://cloud.r-project.org/')"
      - name: Download Collector Parquet
        uses: actions/download-artifact@v4
        with:
          name: collector-parquet-results
          path: collector_data
      - name: Generate Memory Usage Plots
        run: |
          mkdir -p memory_usage_results
          # The collector is ready TIME_DIFF seconds before the load generator is ready
          TIME_DIFF=${TIME_DIFF:-0}
          # Calculate adjusted start times relative to when collector is ready
          START_TIME_1=$((20 + TIME_DIFF))
          START_TIME_2=$((180 + TIME_DIFF))
          echo "Using adjusted start times: $START_TIME_1 and $START_TIME_2 seconds after collector is ready"
          echo "Time difference between collector and load generator ready states: ${TIME_DIFF} seconds"
          # Generate memory usage plots (LLC misses and cache references) with the adjusted start times
          for i in 0.3 0.5 1 3 10; do
            Rscript scripts/plot_memory_usage.R collector_data/collector-parquet.parquet $START_TIME_1 $i memory_usage_results/memory_usage_20sec_$i || true
            Rscript scripts/plot_memory_usage.R collector_data/collector-parquet.parquet $START_TIME_2 $i memory_usage_results/memory_usage_180sec_$i || true
          done
          # Check output
          ls -la memory_usage_results/
      - name: Upload Memory Usage Plots
        uses: actions/upload-artifact@v4
        with:
          name: memory-usage-plots
          path: memory_usage_results/
          retention-days: 28

  # Terminates the EC2 runner even when earlier jobs fail.
  stop-runner:
    name: Stop EC2 runner
    needs: [setup-runner, run-workload, generate-flamegraphs] # Now depends on the flamegraph job
    runs-on: ubuntu-latest
    if: always() # Run even if previous jobs fail
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          ref: ${{ github.event.pull_request.head.sha || github.sha }}
      - name: Stop AWS Runner
        uses: ./.github/actions/aws-runner/cleanup
        with:
          runner-label: ${{ needs.setup-runner.outputs.runner-label }}
          ec2-instance-id: ${{ needs.setup-runner.outputs.ec2-instance-id }}
          github-token: ${{ secrets.REPO_ADMIN_TOKEN }}
          aws-role-arn: ${{ secrets.AWS_ROLE_ARN }}
          aws-region: ${{ needs.setup-runner.outputs.region || secrets.AWS_REGION }}