feat(nri-resctrl-plugin): add integration tests #38
Workflow file for this run
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: benchmark

# Benchmark workflow: provisions a bare-metal EC2 runner, deploys the
# OpenTelemetry demo with a load generator, runs the unvariance collector,
# and gathers perf/pidstat/trace/parquet measurement artifacts.
on:
  workflow_dispatch: # Manual trigger for testing
    inputs:
      instance-type:
        description: 'EC2 instance type to use'
        required: false
        default: 'm7i.metal-24xl'
        type: string
      image-type:
        description: 'Image type to use (ubuntu-22.04 or ubuntu-24.04)'
        required: false
        default: 'ubuntu-22.04'
        type: string
      pidstat-period:
        description: 'Collection frequency for pidstat in seconds'
        required: false
        default: '1'
        type: string
      steady-state-minutes:
        description: 'How long to run the steady-state workload in minutes'
        required: false
        default: '1'
        type: string
      trace-duration:
        description: 'Duration for trace collection in seconds'
        required: false
        default: '10'
        type: string
      collector-repository:
        description: 'Collector image repository to use'
        required: false
        default: 'ghcr.io/unvariance/collector/collector'
        type: string
  pull_request:
    paths:
      - 'deploy/opentelemetry-demo/values.yaml'
      - '.github/workflows/benchmark.yaml'
      - '.github/actions/setup-k3s/**'
  push:
    branches:
      - main
    paths:
      - 'deploy/opentelemetry-demo/values.yaml'
      - '.github/workflows/benchmark.yaml'
      - '.github/actions/setup-k3s/**'

permissions:
  id-token: write # Required for requesting the JWT
  contents: read
  actions: write
jobs:
  # Provisions the self-hosted EC2 runner used by the measurement jobs and
  # computes the workload timeout from the requested steady-state duration.
  setup-runner:
    name: Start EC2 runner
    runs-on: ubuntu-latest
    outputs:
      runner-label: ${{ steps.start-runner.outputs.runner-label }}
      ec2-instance-id: ${{ steps.start-runner.outputs.ec2-instance-id }}
      region: ${{ steps.start-runner.outputs.region }}
      timeout-minutes: ${{ steps.calculate-timeout.outputs.timeout-minutes }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          ref: ${{ github.event.pull_request.head.sha || github.sha }}

      - name: Calculate timeout
        id: calculate-timeout
        run: |
          # Total timeout = 15 minutes fixed overhead + steady-state minutes
          STEADY_STATE_MINUTES=${{ inputs.steady-state-minutes || '1' }}
          TIMEOUT_MINUTES=$((15 + STEADY_STATE_MINUTES))
          echo "timeout-minutes=$TIMEOUT_MINUTES" >> $GITHUB_OUTPUT
          echo "Calculated timeout: $TIMEOUT_MINUTES minutes"

      - name: Start AWS Runner
        id: start-runner
        uses: ./.github/actions/aws-runner
        with:
          github-token: ${{ secrets.REPO_ADMIN_TOKEN }}
          aws-role-arn: ${{ secrets.AWS_ROLE_ARN }}
          iam-role-name: github-actions-runner
          instance-type: ${{ inputs.instance-type || 'm7i.metal-24xl' }}
          image-type: ${{ inputs.image-type || 'ubuntu-22.04' }}
          volume-size: '40'
| run-workload: | |
| needs: [setup-runner] | |
| runs-on: ${{ needs.setup-runner.outputs.runner-label }} | |
| timeout-minutes: ${{ fromJSON(needs.setup-runner.outputs.timeout-minutes) }} | |
| outputs: | |
| uuid-prefix: ${{ steps.generate-uuid.outputs.uuid }} | |
| time-diff: ${{ steps.calculate-time-diff.outputs.time-diff }} | |
| env: | |
| RELEASE_NAME: otel-demo | |
| COLLECTOR_RELEASE_NAME: collector | |
| S3_BUCKET: "unvariance-collector-test-key-auth" | |
| AWS_REGION: ${{ secrets.AWS_REGION }} | |
| HOME: /root | |
| KUBECONFIG: /etc/rancher/k3s/k3s.yaml | |
| PIDSTAT_PERIOD: ${{ inputs.pidstat-period || '1' }} | |
| STEADY_STATE_MINUTES: ${{ inputs.steady-state-minutes || '1' }} | |
| steps: | |
| - name: Create HOME directory | |
| run: | | |
| mkdir -p $HOME | |
| - name: Checkout repository | |
| uses: actions/checkout@v4 | |
| with: | |
| ref: ${{ github.event.pull_request.head.sha || github.sha }} | |
| - name: (k8s) Setup k3s cluster | |
| uses: ./.github/actions/setup-k3s | |
| with: | |
| kubeconfig_path: /etc/rancher/k3s/k3s.yaml | |
| preflight_inotify: true | |
| kubelet_max_pods: 400 | |
| disable_packaged_addons: true | |
| wait_kube_system: true | |
| timeout_api_server_ready_seconds: 300 | |
| timeout_node_ready_seconds: 300 | |
| timeout_kube_system_each_seconds: 10 | |
| max_retries_kube_system_ready: 10 | |
| - name: (k8s) Install Helm | |
| run: | | |
| curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash | |
| - name: (k8s) Add Helm Repositories | |
| run: | | |
| helm repo add unvariance https://unvariance.github.io/collector/charts | |
| helm repo add open-telemetry https://open-telemetry.github.io/opentelemetry-helm-charts | |
| helm repo update | |
| - name: (k8s) Clone forked OpenTelemetry Helm Charts | |
| run: | | |
| git clone https://github.com/yonch/opentelemetry-helm-charts.git | |
| cd opentelemetry-helm-charts | |
| echo "Using forked OpenTelemetry Helm Charts repository" | |
| git log --oneline -5 | |
| - name: (k8s) Update OpenTelemetry Demo Dependencies | |
| run: | | |
| cd opentelemetry-helm-charts/charts/opentelemetry-demo | |
| echo "Updating helm dependencies..." | |
| helm dep update | |
| #### Install KEDA Operator | |
| - name: (k8s) Install KEDA Operator | |
| run: | | |
| helm repo add kedacore https://kedacore.github.io/charts | |
| helm repo update | |
| helm install keda kedacore/keda --namespace keda-system --create-namespace | |
| #### Install OpenTelemetry Demo | |
| - name: (workload) Install OpenTelemetry Demo | |
| run: | | |
| helm install $RELEASE_NAME opentelemetry-helm-charts/charts/opentelemetry-demo -f deploy/opentelemetry-demo/values.yaml | |
| #### Install dependencies | |
| - name: (install) Compute kernel package names | |
| id: kernel | |
| run: | | |
| echo "tools=linux-tools-$(uname -r)" >> $GITHUB_OUTPUT | |
| - name: (install) Install dependencies using apt | |
| uses: awalsh128/cache-apt-pkgs-action@v1 | |
| with: | |
| packages: sysstat linux-tools-common linux-tools-generic bzip2 podman ${{ steps.kernel.outputs.tools }} | |
| version: 1.0 | |
| - name: (install) Verify perf and podman | |
| run: | | |
| perf_version=$(perf --version 2>&1) | |
| echo "Installed perf: $perf_version" | |
| podman_version=$(podman --version 2>&1) | |
| echo "Installed podman: $podman_version" | |
| - name: (install) Install awscli | |
| run: | | |
| curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" | |
| python3 -m zipfile -e awscliv2.zip . | |
| chmod u+x ./aws/install | |
| sudo ./aws/install | |
| echo ls: `ls -l /usr/local/bin/aws` || true | |
| chmod +x /usr/local/bin/aws || true | |
| echo version: `/usr/local/bin/aws --version` || true | |
| - name: (collector) Pre-fetch Collector Image | |
| run: | | |
| COLLECTOR_IMAGE="${{ inputs.collector-repository || 'ghcr.io/unvariance/collector/collector' }}" | |
| echo "Pre-fetching collector image: ${COLLECTOR_IMAGE}" | |
| podman pull "${COLLECTOR_IMAGE}" || true | |
| echo "Pre-fetch completed" | |
| #### Wait for services to be ready | |
| - name: (stabilize) Wait for all Pods to be Ready | |
| run: | | |
| if ! kubectl wait --for=condition=Ready pods --all --timeout=300s; then | |
| echo "Error: Not all pods reached Ready state within timeout" | |
| kubectl get pods | |
| echo "--------------------------------" | |
| kubectl describe pods | |
| exit 1 | |
| fi | |
| #### Show system status | |
| - name: (status) Get Default objects in kube-system | |
| run: | | |
| kubectl get all -n kube-system | |
| - name: (status) Print Events | |
| run: | | |
| kubectl get events | |
| - name: (status) Disk Space Size | |
| run: | | |
| df -h | |
| - name: (status) Print Pod Status | |
| run: | | |
| kubectl get pods -n default | |
| - name: (status) Describe Frontend Deployment | |
| run: | | |
| kubectl describe deployment frontend | |
| #### Deploy Collector | |
| - name: (collector) Generate UUID Prefix | |
| id: generate-uuid | |
| run: | | |
| UUID=$(python3 -c "import uuid; print(uuid.uuid4())") | |
| echo "Using UUID prefix: $UUID" | |
| echo "uuid=$UUID" >> $GITHUB_OUTPUT | |
| - name: (collector) Deploy Collector Helm Chart | |
| env: | |
| AWS_ACCESS_KEY_ID: ${{ secrets.S3_ACCESS_KEY_ID }} | |
| AWS_SECRET_ACCESS_KEY: ${{ secrets.S3_SECRET_ACCESS_KEY }} | |
| run: | | |
| UUID_PREFIX="${{ steps.generate-uuid.outputs.uuid }}-" | |
| # Install the helm chart using --set for values | |
| helm install ${COLLECTOR_RELEASE_NAME} unvariance/collector \ | |
| --set collector.verbose=true \ | |
| --set image.repository="${{ inputs.collector-repository || 'ghcr.io/unvariance/collector/collector' }}" \ | |
| --set storage.type=s3 \ | |
| --set storage.prefix="${UUID_PREFIX}" \ | |
| --set storage.s3.bucket="${S3_BUCKET}" \ | |
| --set storage.s3.region="${AWS_REGION}" \ | |
| --set storage.s3.auth.method=secret \ | |
| --set storage.s3.auth.accessKey="${AWS_ACCESS_KEY_ID}" \ | |
| --set storage.s3.auth.secretKey="${AWS_SECRET_ACCESS_KEY}" \ | |
| --set resources.limits.cpu=1000m \ | |
| --set resources.requests.cpu=1000m \ | |
| --set resources.limits.memory=1000Mi \ | |
| --wait | |
| - name: (collector) Wait for Collector Pods to be Ready | |
| run: | | |
| kubectl wait --for=condition=Ready pods --timeout=60s -l app.kubernetes.io/name=collector || WAIT_STATUS=$? | |
| echo "Wait exit status: $WAIT_STATUS" | |
| # Record timestamp when collector is fully ready | |
| COLLECTOR_START_TIME=$(date +%s) | |
| # Always describe pods, regardless of wait result | |
| echo "Describing collector pods:" | |
| kubectl describe pods -l app.kubernetes.io/name=collector | |
| # Only fail the job if wait failed | |
| if [ -n "$WAIT_STATUS" ] && [ "$WAIT_STATUS" -ne 0 ]; then | |
| echo "ERROR: Collector pods are not ready after timeout" | |
| exit 1 | |
| fi | |
| # Output the timestamp when collector is fully ready | |
| echo "COLLECTOR_START_TIME=${COLLECTOR_START_TIME}" >> $GITHUB_ENV | |
| echo "Collector fully ready at timestamp: ${COLLECTOR_START_TIME}" | |
| - name: (hpa) Check HPA Status After Deployment | |
| run: | | |
| echo "Checking HPA status after deployment but before load generation starts..." | |
| kubectl get hpa -o wide --all-namespaces || echo "No HPA resources found (wide output)" | |
| echo "Checking pod scaling status..." | |
| kubectl get deployment -o wide | |
| echo "Pod top..." | |
| kubectl top pods --all-namespaces || echo "No pod metrics available" | |
| - name: (collector) Show Collector Pod Status | |
| run: | | |
| kubectl get pods -l app.kubernetes.io/name=collector | |
| kubectl logs -l app.kubernetes.io/name=collector -c collector --tail=10 | |
| #### Start load generator | |
| - name: (workload) Start Load Generator | |
| id: calculate-time-diff | |
| run: | | |
| helm upgrade $RELEASE_NAME opentelemetry-helm-charts/charts/opentelemetry-demo -f deploy/opentelemetry-demo/values.yaml \ | |
| --set components.load-generator.enabled=true | |
| # Wait for load generator pods to be specifically in Ready state | |
| echo "Waiting for load generator pods to be ready..." | |
| kubectl wait --for=condition=Ready pods -l app.kubernetes.io/component=load-generator --timeout=60s || true | |
| # Record timestamp when load generator is fully ready | |
| LOAD_GEN_START_TIME=$(date +%s) | |
| # Calculate the time difference between collector and load generator start | |
| TIME_DIFF=$((LOAD_GEN_START_TIME - COLLECTOR_START_TIME)) | |
| echo "time-diff=${TIME_DIFF}" >> $GITHUB_OUTPUT | |
| echo "TIME_DIFF=${TIME_DIFF}" >> $GITHUB_ENV | |
| echo "Load generator fully ready at timestamp: ${LOAD_GEN_START_TIME}" | |
| echo "Time difference between collector and load generator ready: ${TIME_DIFF} seconds" | |
| - name: (workload) Describe Load Generator Deployment | |
| run: | | |
| kubectl describe deployment load-generator | |
| #### Start PIDs stat collection | |
| - name: (metrics) Start PIDs stat collection | |
| id: start-pidstat | |
| run: | | |
| # Create directory for metrics | |
| mkdir -p /tmp/system_metrics | |
| # Set file paths | |
| CPU_METRICS_FILE="/tmp/system_metrics/cpu_metrics.csv" | |
| MEMORY_METRICS_FILE="/tmp/system_metrics/memory_metrics.csv" | |
| # Start CPU metrics collection in background | |
| pidstat -H -u -p ALL ${PIDSTAT_PERIOD} | awk '{gsub(/^ +| +$/,""); gsub(/ +/,";"); print}' > ${CPU_METRICS_FILE} 2>&1 & | |
| CPU_PIDSTAT_PID=$! | |
| # Start memory metrics collection in background | |
| pidstat -H -r -p ALL ${PIDSTAT_PERIOD} | awk '{gsub(/^ +| +$/,""); gsub(/ +/,";"); print}' > ${MEMORY_METRICS_FILE} 2>&1 & | |
| MEMORY_PIDSTAT_PID=$! | |
| # Verify processes are running | |
| ps -p $CPU_PIDSTAT_PID | |
| ps -p $MEMORY_PIDSTAT_PID | |
| # Set environment variables for later use | |
| echo "CPU_METRICS_FILE=${CPU_METRICS_FILE}" >> $GITHUB_ENV | |
| echo "MEMORY_METRICS_FILE=${MEMORY_METRICS_FILE}" >> $GITHUB_ENV | |
| echo "CPU_PIDSTAT_PID=${CPU_PIDSTAT_PID}" >> $GITHUB_ENV | |
| echo "MEMORY_PIDSTAT_PID=${MEMORY_PIDSTAT_PID}" >> $GITHUB_ENV | |
| echo "Started PIDs stat collection:" | |
| echo "CPU metrics PID: ${CPU_PIDSTAT_PID}" | |
| echo "Memory metrics PID: ${MEMORY_PIDSTAT_PID}" | |
| echo "CPU metrics file: ${CPU_METRICS_FILE}" | |
| echo "Memory metrics file: ${MEMORY_METRICS_FILE}" | |
| #### Main run time | |
| - name: (metrics) Ramp up load generator | |
| run: | | |
| echo "Ramping up load generator without perf record..." | |
| sleep 200 | |
| - name: (hpa) Check HPA Status Before Measurements | |
| run: | | |
| echo "Checking HPA status just before measurements with perf..." | |
| kubectl get hpa -o wide --all-namespaces || echo "No HPA resources found (wide output)" | |
| echo "Checking pod scaling status..." | |
| kubectl get deployment -o wide | |
| echo "Pod top..." | |
| kubectl top pods --all-namespaces || echo "No pod metrics available" | |
| - name: (trace) Collect Trace Data | |
| run: | | |
| # Create directory for trace output | |
| mkdir -p /tmp/trace_data | |
| TRACE_DURATION="${{ inputs.trace-duration || '10' }}" | |
| COLLECTOR_IMAGE="${{ inputs.collector-repository || 'ghcr.io/unvariance/collector/collector' }}" | |
| echo "Starting trace collection for ${TRACE_DURATION} seconds..." | |
| # Run collector in trace mode using podman with sudo for BPF access | |
| sudo podman run --rm \ | |
| --privileged \ | |
| --network host \ | |
| --pid host \ | |
| --volume /tmp/trace_data:/tmp/trace_data \ | |
| --volume /sys:/sys \ | |
| --volume /proc:/proc:ro \ | |
| --volume /dev:/dev:ro \ | |
| --user 0 \ | |
| "${COLLECTOR_IMAGE}" \ | |
| --trace \ | |
| --prefix "/tmp/trace_data/trace-" \ | |
| --duration "${TRACE_DURATION}" \ | |
| --parquet-file-size 4000000000 \ | |
| --storage-type local | |
| echo "Trace collection completed" | |
| # List trace files | |
| echo "Generated trace files:" | |
| ls -la /tmp/trace_data/ | |
| - name: (metrics) Record system performance with perf | |
| run: | | |
| # Create directory for perf data | |
| mkdir -p /tmp/perf_data | |
| STEADY_STATE_MINUTES=${STEADY_STATE_MINUTES} | |
| PERF_RECORD_SECONDS=60 | |
| # Calculate the sleep duration (all minutes except the last one) | |
| if [[ $STEADY_STATE_MINUTES -gt 1 ]]; then | |
| SLEEP_MINUTES=$((STEADY_STATE_MINUTES - 1)) | |
| SLEEP_SECONDS=$((SLEEP_MINUTES * 60)) | |
| echo "Sleeping for $SLEEP_MINUTES minutes ($SLEEP_SECONDS seconds) before perf recording..." | |
| sleep $SLEEP_SECONDS | |
| fi | |
| echo "Starting perf record for $PERF_RECORD_SECONDS seconds at 19Hz frequency..." | |
| perf record -a -g -F 19 --call-graph dwarf -o /tmp/perf_data/perf.data sleep $PERF_RECORD_SECONDS | |
| # Create directory for perf results | |
| mkdir -p perf_results | |
| # List the perf files | |
| echo "Perf data files:" | |
| ls -la /tmp/perf_data/ | |
| - name: (metrics) Stop PIDs stat collection | |
| run: | | |
| echo "Stopping PIDs stat collection processes" | |
| kill -TERM $CPU_PIDSTAT_PID || true | |
| kill -TERM $MEMORY_PIDSTAT_PID || true | |
| # Wait a moment to ensure files are fully written | |
| sleep 2 | |
| # Check if files exist and have content | |
| echo "CPU metrics file size: $(stat -c %s $CPU_METRICS_FILE || echo 'file not found')" | |
| echo "Memory metrics file size: $(stat -c %s $MEMORY_METRICS_FILE || echo 'file not found')" | |
| # Create directory for system metrics | |
| mkdir -p system_metrics | |
| # Copy the files to the upload directory | |
| cp $CPU_METRICS_FILE system_metrics/cpu_metrics.csv || echo "Failed to copy CPU metrics file" | |
| cp $MEMORY_METRICS_FILE system_metrics/memory_metrics.csv || echo "Failed to copy memory metrics file" | |
| # List the files | |
| echo "System metrics files:" | |
| ls -la system_metrics/ | |
| - name: (metrics) Extract Locust CSV files | |
| run: | | |
| # Create directory for the results | |
| mkdir -p locust_results | |
| # Get the load-generator pod name | |
| LOADGEN_POD=$(kubectl get pods -l app.kubernetes.io/component=load-generator -o name | cut -d/ -f2) | |
| echo "Load Generator Pod: $LOADGEN_POD" | |
| # Copy CSV files from the pod | |
| echo "Copying CSV files from pod..." | |
| kubectl cp $LOADGEN_POD:/tmp/locust_results/ ./locust_results/ | |
| # List the extracted files | |
| echo "Extracted CSV files:" | |
| ls -la locust_results/ | |
| - name: (metrics) Upload Locust Results as Artifacts | |
| uses: actions/upload-artifact@v4 | |
| with: | |
| name: locust-csv-results | |
| path: locust_results/ | |
| retention-days: 28 | |
| - name: (metrics) Upload System Metrics as Artifacts | |
| uses: actions/upload-artifact@v4 | |
| with: | |
| name: system-metrics-results | |
| path: system_metrics/ | |
| retention-days: 28 | |
| - name: (collector) Send SIGINT to Collector | |
| run: | | |
| kubectl exec -n default $(kubectl get pods -l app.kubernetes.io/name=collector -o name | cut -d/ -f2) -- /bin/sh -c "kill -INT 1" | |
| sleep 10 | |
| - name: (collector) Print Collector Logs | |
| run: | | |
| kubectl logs -l app.kubernetes.io/name=collector || true | |
| - name: (collector) Uninstall Collector Helm Chart | |
| run: | | |
| # see the logs while uninstalling | |
| kubectl logs -f --tail=-1 -l app.kubernetes.io/name=collector || true & | |
| LOGS_PID=$! | |
| # wait for the pods to be deleted, in the background | |
| kubectl wait --for=delete pods -l app.kubernetes.io/name=collector --timeout=45s || true & | |
| WAIT_PID=$! | |
| # uninstall the chart | |
| helm uninstall ${COLLECTOR_RELEASE_NAME} | |
| # wait for the pods to be deleted | |
| wait $WAIT_PID || true | |
| # kill the logs | |
| kill -TERM $LOGS_PID || true | |
| - name: (collector) Get Collector Parquet File | |
| env: | |
| AWS_ACCESS_KEY_ID: ${{ secrets.S3_ACCESS_KEY_ID }} | |
| AWS_SECRET_ACCESS_KEY: ${{ secrets.S3_SECRET_ACCESS_KEY }} | |
| run: | | |
| # Get UUID prefix from the output | |
| UUID_PREFIX="${{ steps.generate-uuid.outputs.uuid }}-" | |
| # List files with the UUID prefix | |
| echo "Checking for files with prefix ${UUID_PREFIX} in S3 bucket ${S3_BUCKET}" | |
| S3_FILES=$(aws s3 ls "s3://${S3_BUCKET}/${UUID_PREFIX}" --recursive || echo "") | |
| if [ -z "$S3_FILES" ]; then | |
| echo "No files found with prefix ${UUID_PREFIX} in bucket ${S3_BUCKET}" | |
| exit 1 | |
| else | |
| echo "Found files with prefix ${UUID_PREFIX}:" | |
| echo "$S3_FILES" | |
| # Get the file path | |
| PARQUET_FILE=$(echo "$S3_FILES" | head -n 1 | awk '{print $4}') | |
| # Download the parquet file for validation | |
| aws s3 cp "s3://${S3_BUCKET}/${PARQUET_FILE}" /tmp/collector-parquet.parquet | |
| # Check file size | |
| FILE_SIZE=$(stat -c %s /tmp/collector-parquet.parquet) | |
| echo "Downloaded collector file size: ${FILE_SIZE} bytes" | |
| fi | |
| - name: (collector) Upload Collector Results as Artifacts | |
| uses: actions/upload-artifact@v4 | |
| with: | |
| name: collector-parquet-results | |
| path: /tmp/collector-parquet.parquet | |
| retention-days: 28 | |
| - name: (trace) Upload Trace Results as Artifacts | |
| uses: actions/upload-artifact@v4 | |
| with: | |
| name: trace-parquet-results | |
| path: /tmp/trace_data/ | |
| retention-days: 28 | |
| if: always() | |
| - name: (status) Print Events | |
| run: | | |
| kubectl get events | |
| - name: (status) Print Pod Status | |
| run: | | |
| kubectl get pods -n default | |
| - name: (hpa) Check HPA Status At End of Benchmark | |
| run: | | |
| echo "Checking HPA status at the end of the benchmark run..." | |
| kubectl get hpa -o wide --all-namespaces || echo "No HPA resources found (wide output)" | |
| echo "Final deployment scaling status..." | |
| kubectl get deployment -o wide | |
| echo "Pod top..." | |
| kubectl top pods --all-namespaces || echo "No pod metrics available" | |
| - name: (opentelemetry-collector) Collect Internal Metrics File | |
| run: | | |
| # Create directory for internal metrics | |
| mkdir -p otel_internal_metrics | |
| # Get the OpenTelemetry collector pod name | |
| OTEL_POD=$(kubectl get pods -l app.kubernetes.io/name=otel-collector -o name | cut -d/ -f2) | |
| echo "OpenTelemetry Collector Pod: $OTEL_POD" | |
| # Check if the internal metrics file exists and copy it | |
| echo "Checking for internal metrics file..." | |
| if kubectl exec $OTEL_POD -- test -f /tmp/internal-metrics/otel-collector-internal-metrics.jsonl; then | |
| echo "Internal metrics file found, copying..." | |
| kubectl cp $OTEL_POD:/tmp/internal-metrics/otel-collector-internal-metrics.jsonl ./otel_internal_metrics/otel-collector-internal-metrics.jsonl | |
| # Check file size and show first few lines | |
| if [ -f ./otel_internal_metrics/otel-collector-internal-metrics.jsonl ]; then | |
| FILE_SIZE=$(stat -c %s ./otel_internal_metrics/otel-collector-internal-metrics.jsonl) | |
| echo "Internal metrics file size: ${FILE_SIZE} bytes" | |
| echo "First 5 lines of internal metrics file:" | |
| head -n 5 ./otel_internal_metrics/otel-collector-internal-metrics.jsonl || true | |
| else | |
| echo "Failed to copy internal metrics file" | |
| fi | |
| else | |
| echo "Internal metrics file not found in pod" | |
| kubectl exec $OTEL_POD -- ls -la /tmp/internal-metrics/ || echo "Directory /tmp/internal-metrics/ not found" | |
| fi | |
| # List the extracted files | |
| echo "Internal metrics files:" | |
| ls -la otel_internal_metrics/ || echo "No internal metrics files found" | |
| - name: (opentelemetry-collector) Upload Internal Metrics as Artifacts | |
| uses: actions/upload-artifact@v4 | |
| with: | |
| name: otel-internal-metrics | |
| path: otel_internal_metrics/ | |
| retention-days: 28 | |
| if: always() | |
| generate-flamegraphs: | |
| name: Generate Flamegraphs | |
| needs: [setup-runner, run-workload] | |
| runs-on: ${{ needs.setup-runner.outputs.runner-label }} | |
| timeout-minutes: 10 | |
| env: | |
| HOME: /root | |
| steps: | |
| - name: Create HOME directory | |
| run: | | |
| mkdir -p $HOME | |
| - name: Checkout repository | |
| uses: actions/checkout@v4 | |
| with: | |
| ref: ${{ github.event.pull_request.head.sha || github.sha }} | |
| - name: (install) Install build tools | |
| uses: awalsh128/cache-apt-pkgs-action@v1 | |
| with: | |
| packages: build-essential | |
| version: 1.0 | |
| - name: (install) Install Rust and addr2line | |
| run: | | |
| # Install Rust | |
| curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y | |
| source "$HOME/.cargo/env" | |
| # Verify Rust installation | |
| rustc --version | |
| cargo --version | |
| # Install addr2line from crates.io | |
| cargo install addr2line --features="bin" | |
| # Check if the system addr2line exists and back it up if it does | |
| if [ -f /usr/bin/addr2line ]; then | |
| sudo mv /usr/bin/addr2line /usr/bin/addr2line.bak | |
| echo "Backed up original addr2line to /usr/bin/addr2line.bak" | |
| fi | |
| # Copy the Rust addr2line to replace the system version | |
| sudo cp "$HOME/.cargo/bin/addr2line" /usr/bin/addr2line | |
| sudo chmod +x /usr/bin/addr2line | |
| # Verify the replacement | |
| addr2line --version | |
| echo "Replaced system addr2line with Rust implementation" | |
| # Install Inferno for flamegraph generation | |
| cargo install inferno | |
| # Verify Inferno installation | |
| inferno-flamegraph --help | |
| - name: (metrics) Generate Perf Report and Flamegraphs | |
| run: | | |
| # so we can use the inferno binaries | |
| source "$HOME/.cargo/env" | |
| mkdir -p flamegraph_results perf_results | |
| # Generate a report summary in text format | |
| echo "Generating perf report - this may take some time..." | |
| perf report --stdio -i /tmp/perf_data/perf.data > perf_results/perf_report.txt || true | |
| # Generate collapsed stack traces (folded) | |
| echo "Generating folded stacks from perf data..." | |
| perf script -i /tmp/perf_data/perf.data | inferno-collapse-perf > flamegraph_results/stacks.folded | |
| # Generate flamegraphs with different color schemes | |
| echo "Generating flamegraphs from folded stacks..." | |
| cat flamegraph_results/stacks.folded | inferno-flamegraph > flamegraph_results/flamegraph.svg | |
| # List the perf and flamegraph files | |
| echo "Perf results:" | |
| ls -la perf_results/ | |
| echo "Flamegraph results:" | |
| ls -la flamegraph_results/ | |
| - name: (metrics) Upload Perf and Flamegraph Results as Artifacts | |
| uses: actions/upload-artifact@v4 | |
| with: | |
| name: performance-analysis | |
| path: | | |
| perf_results/ | |
| flamegraph_results/ | |
| retention-days: 28 | |
| generate-visualizations: | |
| name: Generate Performance Visualizations | |
| needs: [run-workload] | |
| runs-on: ubuntu-latest | |
| env: | |
| STEADY_STATE_MINUTES: ${{ inputs.steady-state-minutes || '1' }} | |
| TIME_DIFF: ${{ needs.run-workload.outputs.time-diff || '0' }} | |
| container: | |
| image: rocker/tidyverse:latest | |
| steps: | |
| - name: Checkout repository | |
| uses: actions/checkout@v4 | |
| with: | |
| ref: ${{ github.event.pull_request.head.sha || github.sha }} | |
| - name: Install R Dependencies | |
| run: | | |
| # Install only the additional packages needed (tidyverse is already included in base image) | |
| R -e "install.packages(c('scales', 'nanoparquet', 'gridExtra', 'stringr'), repos='https://cloud.r-project.org/')" | |
| - name: Download System Metrics Artifacts | |
| uses: actions/download-artifact@v4 | |
| with: | |
| name: system-metrics-results | |
| path: system_metrics | |
| - name: Download Locust Results | |
| uses: actions/download-artifact@v4 | |
| with: | |
| name: locust-csv-results | |
| path: locust_results | |
| - name: Download Collector Parquet | |
| uses: actions/download-artifact@v4 | |
| with: | |
| name: collector-parquet-results | |
| path: collector_data | |
| - name: List Downloaded Files | |
| run: | | |
| echo "System Metrics Files:" | |
| ls -la system_metrics/ | |
| echo "Locust Results:" | |
| ls -la locust_results/ | |
| echo "Collector Data:" | |
| ls -la collector_data/ | |
| - name: Convert CPU Metrics Format | |
| run: | | |
| # Create directory for converted metrics | |
| mkdir -p converted_metrics | |
| # Convert CPU metrics | |
| bash scripts/convert_cpu_metrics.sh system_metrics/cpu_metrics.csv converted_metrics/cpu_metrics.csv | |
| # Check output | |
| echo "Converted CPU Metrics:" | |
| head -n 5 converted_metrics/cpu_metrics.csv | |
| - name: Generate Memory Utilization Plots | |
| run: | | |
| mkdir -p visualization_results | |
| # Generate memory utilization plots for collector process | |
| Rscript scripts/plot_memory_utilization.R system_metrics/memory_metrics.csv collector visualization_results/memory_utilization | |
| # Check output | |
| ls -la visualization_results/ | |
| - name: Generate CPU Utilization Plots | |
| run: | | |
| # Generate CPU utilization plots | |
| Rscript scripts/plot_cpu_utilization.R converted_metrics/cpu_metrics.csv collector visualization_results/cpu_utilization | |
| # Check output | |
| ls -la visualization_results/ | |
| - name: Generate Workload Performance Plots | |
| run: | | |
| # Find the stats_history file | |
| STATS_FILE=$(find locust_results -name "*stats_history.csv" | head -1) | |
| if [ -n "$STATS_FILE" ]; then | |
| echo "Using Locust stats file: $STATS_FILE" | |
| # Generate workload performance plots | |
| Rscript scripts/plot_workload_performance.R $STATS_FILE visualization_results/workload_performance | |
| else | |
| echo "No stats_history.csv file found in locust_results directory" | |
| ls -la locust_results/ | |
| exit 1 | |
| fi | |
| # Check output | |
| ls -la visualization_results/ | |
| - name: Generate CPI by LLC Misses Plots | |
| run: | | |
| # Calculate the time window based on steady state minutes and the time difference between | |
| # when collector is ready and when load generator is ready | |
| TIME_DIFF=${TIME_DIFF:-0} | |
| # Start: 200 seconds after load generator is ready (which is 200+TIME_DIFF seconds after collector is ready) | |
| # End: End 5 seconds before the steady state ends | |
| START_TIME=$((205 + TIME_DIFF)) | |
| STEADY_STATE_SECONDS=$((${STEADY_STATE_MINUTES} * 60)) | |
| END_TIME=$((200 + STEADY_STATE_SECONDS - 5 + TIME_DIFF)) | |
| # Calculate an alternative end time capped at 180 seconds from start | |
| END_TIME_CAPPED=$((START_TIME + 180)) | |
| # Use the earlier of the two end times for the main plot | |
| if [ "$END_TIME_CAPPED" -lt "$END_TIME" ]; then | |
| echo "Using capped end time (180s from start): $START_TIME - $END_TIME_CAPPED seconds (steady state: ${STEADY_STATE_MINUTES} min)" | |
| echo "Time difference between collector and load generator ready states: ${TIME_DIFF} seconds" | |
| # Generate an additional plot with the original end time | |
| Rscript scripts/plot_cpi_by_llc_misses.R collector_data/collector-parquet.parquet 100000 visualization_results/cpi_by_llc_misses_capped 23 $START_TIME $END_TIME_CAPPED | |
| else | |
| echo "No need to cap the length: using steady state end time" | |
| fi | |
| echo "Using time window: $START_TIME - $END_TIME seconds (steady state: ${STEADY_STATE_MINUTES} min)" | |
| echo "Time difference between collector and load generator ready states: ${TIME_DIFF} seconds" | |
| # Generate CPI by LLC misses plots with time window parameters | |
| Rscript scripts/plot_cpi_by_llc_misses.R collector_data/collector-parquet.parquet 100000 visualization_results/cpi_by_llc_misses 23 $START_TIME $END_TIME || true | |
| - name: Generate Contention Analysis Plots | |
| run: | | |
| # Calculate the time window similar to CPI by LLC Misses plots but capped at 60 seconds | |
| TIME_DIFF=${TIME_DIFF:-0} | |
| # Start: 200 seconds after load generator is ready (which is 200+TIME_DIFF seconds after collector is ready) | |
| # End: End 5 seconds before the steady state ends | |
| START_TIME=$((205 + TIME_DIFF)) | |
| STEADY_STATE_SECONDS=$((${STEADY_STATE_MINUTES} * 60)) | |
| END_TIME=$((200 + STEADY_STATE_SECONDS - 5 + TIME_DIFF)) | |
| # Cap the window duration at 60 seconds | |
| WINDOW_DURATION=$((END_TIME - START_TIME)) | |
| if [ "$WINDOW_DURATION" -gt 60 ]; then | |
| WINDOW_DURATION=60 | |
| END_TIME=$((START_TIME + 60)) | |
| fi | |
| echo "Using time window for contention analysis: $START_TIME - $END_TIME seconds (${WINDOW_DURATION}s duration, steady state: ${STEADY_STATE_MINUTES} min)" | |
| echo "Time difference between collector and load generator ready states: ${TIME_DIFF} seconds" | |
| # Generate contention analysis plots with calculated time window | |
| Rscript scripts/plot_contention_analysis.R collector_data/collector-parquet.parquet $WINDOW_DURATION visualization_results/contention_analysis 18 0.2 $END_TIME || true | |
| # Check output | |
| ls -la visualization_results/contention_analysis_* || echo "No contention analysis plots generated" | |
| - name: Generate Instructions vs CPI Plots | |
| run: | | |
| # Calculate the time window similar to CPI by LLC Misses plots but capped at 60 seconds | |
| TIME_DIFF=${TIME_DIFF:-0} | |
| # Start: 200 seconds after load generator is ready (which is 200+TIME_DIFF seconds after collector is ready) | |
| # End: End 5 seconds before the steady state ends | |
| START_TIME=$((205 + TIME_DIFF)) | |
| STEADY_STATE_SECONDS=$((${STEADY_STATE_MINUTES} * 60)) | |
| END_TIME=$((200 + STEADY_STATE_SECONDS - 5 + TIME_DIFF)) | |
| # Cap the window duration at 60 seconds | |
| WINDOW_DURATION=$((END_TIME - START_TIME)) | |
| if [ "$WINDOW_DURATION" -gt 60 ]; then | |
| WINDOW_DURATION=60 | |
| END_TIME=$((START_TIME + 60)) | |
| fi | |
| echo "Using time window for instructions vs CPI: $START_TIME - $END_TIME seconds (${WINDOW_DURATION}s duration, steady state: ${STEADY_STATE_MINUTES} min)" | |
| echo "Time difference between collector and load generator ready states: ${TIME_DIFF} seconds" | |
| # Generate Instructions vs CPI scatter plot with calculated time window | |
| Rscript scripts/plot_instructions_vs_cpi.R collector_data/collector-parquet.parquet $WINDOW_DURATION visualization_results/instructions_vs_cpi_steady_state 18 $END_TIME || true | |
| # Check output | |
| ls -la visualization_results/instructions_vs_cpi_* || echo "No instructions vs CPI plots generated" | |
      # Publish every plot produced by the preceding visualization steps as a
      # single downloadable artifact, kept for 28 days.
      - name: Upload Visualization Results
        uses: actions/upload-artifact@v4
        with:
          name: performance-visualizations
          path: visualization_results/
          retention-days: 28
| analyze-parquet: | |
| name: Analyze Parquet Data | |
| needs: [run-workload] | |
| runs-on: ubuntu-latest | |
| strategy: | |
| matrix: | |
| data-type: [timeslot, trace] | |
| steps: | |
| - name: Download parquet artifact | |
| uses: actions/download-artifact@v4 | |
| with: | |
| name: ${{ matrix.data-type == 'timeslot' && 'collector-parquet-results' || 'trace-parquet-results' }} | |
| path: parquet-data | |
| - name: Install pqrs | |
| run: | | |
| curl -L -o pqrs.zip https://github.com/manojkarthick/pqrs/releases/download/v0.3.2/pqrs-0.3.2-x86_64-unknown-linux-gnu.zip | |
| python3 -m zipfile -e pqrs.zip . | |
| sudo mv pqrs-0.3.2-x86_64-unknown-linux-gnu/bin/pqrs /usr/local/bin/ | |
| sudo chmod +x /usr/local/bin/pqrs | |
| rm -rf pqrs.zip pqrs-0.3.2-x86_64-unknown-linux-gnu | |
| pqrs --version | |
| - name: Analyze parquet file | |
| run: | | |
| mkdir -p parquet-analysis | |
| PARQUET_FILE=$(find parquet-data -name "*.parquet" -type f | head -1) | |
| echo "Found parquet file: $PARQUET_FILE" | |
| if [ -z "$PARQUET_FILE" ]; then | |
| echo "No parquet files found for ${{ matrix.data-type }} data" | |
| exit 0 | |
| fi | |
| # Generate simple schema (non-detailed) | |
| echo "Generating simple schema..." | |
| pqrs schema $PARQUET_FILE > parquet-analysis/schema.txt | |
| # Generate detailed schema | |
| echo "Generating detailed schema..." | |
| pqrs schema --detailed $PARQUET_FILE > parquet-analysis/schema-detailed.txt | |
| # Generate sample records (100) | |
| echo "Generating 100 sample records..." | |
| pqrs sample --records 100 $PARQUET_FILE > parquet-analysis/sample-100.txt | |
| # Generate head records (first 100) | |
| echo "Generating first 100 records..." | |
| pqrs head --records 100 $PARQUET_FILE > parquet-analysis/head-100.txt | |
| # Generate JSON versions too | |
| echo "Generating JSON formatted outputs..." | |
| pqrs schema --json $PARQUET_FILE > parquet-analysis/schema.json | |
| pqrs sample --json --records 100 $PARQUET_FILE > parquet-analysis/sample-100.json | |
| pqrs head --json --records 100 $PARQUET_FILE > parquet-analysis/head-100.json | |
| # List the generated files | |
| echo "Generated analysis files:" | |
| ls -la parquet-analysis/ | |
| - name: Upload parquet analysis results | |
| uses: actions/upload-artifact@v4 | |
| with: | |
| name: parquet-analysis-${{ matrix.data-type }}-results | |
| path: parquet-analysis/ | |
| retention-days: 28 | |
  # Render memory-usage plots (at several bin widths) from the collector's
  # parquet output, inside an R container with tidyverse preinstalled.
  generate-memory-usage-plots:
    name: Generate Memory Usage Plots
    needs: [run-workload]
    runs-on: ubuntu-latest
    env:
      # Seconds between collector-ready and load-generator-ready, exported by
      # run-workload; defaults to '0' when the output is empty.
      TIME_DIFF: ${{ needs.run-workload.outputs.time-diff || '0' }}
    container:
      image: rocker/tidyverse:latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          # Check out the PR head when triggered by pull_request, the pushed
          # commit otherwise.
          ref: ${{ github.event.pull_request.head.sha || github.sha }}
      - name: Install R Dependencies
        run: |
          # Install only the additional packages needed (tidyverse is already included in base image)
          R -e "install.packages(c('scales', 'nanoparquet', 'gridExtra', 'stringr'), repos='https://cloud.r-project.org/')"
      - name: Download Collector Parquet
        uses: actions/download-artifact@v4
        with:
          name: collector-parquet-results
          path: collector_data
      - name: Generate Memory Usage Plots
        run: |
          mkdir -p memory_usage_results
          # The collector is ready TIME_DIFF seconds before the load generator is ready
          TIME_DIFF=${TIME_DIFF:-0}
          # Calculate adjusted start times relative to when collector is ready
          START_TIME_1=$((20 + TIME_DIFF))
          START_TIME_2=$((180 + TIME_DIFF))
          echo "Using adjusted start times: $START_TIME_1 and $START_TIME_2 seconds after collector is ready"
          echo "Time difference between collector and load generator ready states: ${TIME_DIFF} seconds"
          # Generate memory usage plots (LLC misses and cache references) with the adjusted start times.
          # One plot per (start offset, bin width in seconds) pair; failures are
          # tolerated ('|| true') so remaining combinations still run.
          for i in 0.3 0.5 1 3 10; do
            Rscript scripts/plot_memory_usage.R collector_data/collector-parquet.parquet $START_TIME_1 $i memory_usage_results/memory_usage_20sec_$i || true
            Rscript scripts/plot_memory_usage.R collector_data/collector-parquet.parquet $START_TIME_2 $i memory_usage_results/memory_usage_180sec_$i || true
          done
          # Check output
          ls -la memory_usage_results/
      - name: Upload Memory Usage Plots
        uses: actions/upload-artifact@v4
        with:
          name: memory-usage-plots
          path: memory_usage_results/
          retention-days: 28
  # Terminate the self-hosted EC2 runner started by setup-runner. Runs even
  # when upstream jobs fail so the instance is never leaked.
  stop-runner:
    name: Stop EC2 runner
    # NOTE(review): analyze-parquet and generate-memory-usage-plots are not in
    # 'needs' — presumably safe because they run on GitHub-hosted runners, not
    # the EC2 instance; confirm before stopping the runner earlier.
    needs: [setup-runner, run-workload, generate-flamegraphs] # Now depends on the flamegraph job
    runs-on: ubuntu-latest
    if: always() # Run even if previous jobs fail
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          ref: ${{ github.event.pull_request.head.sha || github.sha }}
      - name: Stop AWS Runner
        uses: ./.github/actions/aws-runner/cleanup
        with:
          runner-label: ${{ needs.setup-runner.outputs.runner-label }}
          ec2-instance-id: ${{ needs.setup-runner.outputs.ec2-instance-id }}
          github-token: ${{ secrets.REPO_ADMIN_TOKEN }}
          aws-role-arn: ${{ secrets.AWS_ROLE_ARN }}
          # NOTE(review): setup-runner's visible outputs are only runner-label
          # and ec2-instance-id, so 'outputs.region' may always be empty and
          # the secret fallback always taken — verify the job actually exports
          # a 'region' output.
          aws-region: ${{ needs.setup-runner.outputs.region || secrets.AWS_REGION }}