Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

feat(nri-resctrl-plugin): add integration tests #102

feat(nri-resctrl-plugin): add integration tests

feat(nri-resctrl-plugin): add integration tests #102

Workflow file for this run

# CI workflow for the nri-init component: Helm chart linting and template
# rendering checks, KIND/K3s Helm-install tests, and Rust unit/integration
# tests. Triggered on PRs and pushes to main that touch nri-init sources.
name: test-nri-init
on:
  pull_request:
    paths:
      - 'Cargo.toml'
      - 'crates/nri-init/**'
      - 'Dockerfile.nri-init'
      - '.github/actions/setup-k3s/**'
      - '.github/workflows/test-nri-init.yaml'
  push:
    branches:
      - main
    # Same path filter as pull_request so post-merge runs cover the same changes.
    paths:
      - 'Cargo.toml'
      - 'crates/nri-init/**'
      - 'Dockerfile.nri-init'
      - '.github/actions/setup-k3s/**'
      - '.github/workflows/test-nri-init.yaml'
  # Allow manual runs from the Actions UI.
  workflow_dispatch:
jobs:
  # Lints the collector chart with its default values and with each of the
  # three NRI-related CI values files.
  helm-lint:
    name: Helm Chart Linting
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
      - name: Set up Helm
        uses: azure/setup-helm@v4
        with:
          version: 'latest'
      - name: Lint Helm chart
        run: helm lint charts/collector
      - name: Lint with NRI disabled values
        run: helm lint charts/collector -f charts/collector/ci/nri-disabled-values.yaml
      - name: Lint with NRI configure-only values
        run: helm lint charts/collector -f charts/collector/ci/nri-configure-only-values.yaml
      - name: Lint with NRI full setup values
        run: helm lint charts/collector -f charts/collector/ci/nri-full-setup-values.yaml
helm-template:
name: Test Helm Template Rendering
runs-on: ubuntu-latest
strategy:
matrix:
values:
- name: "Default values"
file: ""
configure_expected: "false"
restart_expected: "false"
- name: "NRI disabled"
file: "charts/collector/ci/nri-disabled-values.yaml"
configure_expected: "false"
restart_expected: "false"
- name: "NRI configure only"
file: "charts/collector/ci/nri-configure-only-values.yaml"
configure_expected: "true"
restart_expected: "false"
- name: "NRI full setup"
file: "charts/collector/ci/nri-full-setup-values.yaml"
configure_expected: "true"
restart_expected: "true"
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Install prerequisites
uses: awalsh128/cache-apt-pkgs-action@v1
with:
packages: jq
version: 1.0
- name: Set up Helm
uses: azure/setup-helm@v4
with:
version: 'latest'
- name: Template chart - ${{ matrix.values.name }}
run: |
if [ -z "${{ matrix.values.file }}" ]; then
helm template test-release charts/collector > /tmp/rendered.yaml
else
helm template test-release charts/collector -f ${{ matrix.values.file }} > /tmp/rendered.yaml
fi
echo "=== Rendered template ==="
cat /tmp/rendered.yaml
- name: Verify expected content
run: |
# Basic presence of the init container
if grep -q "name: nri-init" /tmp/rendered.yaml; then
echo "✓ Found init container name"
else
echo "✗ Missing init container name"
exit 1
fi
# Verify env values adjacent to their names
CE='${{ matrix.values.configure_expected }}'
RE='${{ matrix.values.restart_expected }}'
awk -v ce="$CE" -v re="$RE" '
/name: NRI_CONFIGURE/ { in_cfg=1; next }
in_cfg && /value: \"/ { if ($0 ~ "value: \"" ce "\"") cfg_ok=1; in_cfg=0 }
/name: NRI_RESTART/ { in_rst=1; next }
in_rst && /value: \"/ { if ($0 ~ "value: \"" re "\"") rst_ok=1; in_rst=0 }
END { if (!(cfg_ok && rst_ok)) exit 1 }
' /tmp/rendered.yaml && echo "✓ Env values match (configure=$CE, restart=$RE)" || { echo "✗ Env values mismatch"; exit 1; }
- name: Verify volume mounts
run: |
required_volumes="etc-containerd var-lib-rancher var-run"
for vol in $required_volumes; do
if grep -q "name: $vol" /tmp/rendered.yaml; then
echo "✓ Volume $vol found"
else
echo "✗ Volume $vol missing"
exit 1
fi
done
- name: Verify hostPID when restart enabled
if: ${{ matrix.values.restart_expected == 'true' }}
run: |
if grep -qE '\bhostPID: true\b' /tmp/rendered.yaml; then
echo "✓ hostPID: true present for restart=true"
else
echo "✗ hostPID not set while restart=true"
exit 1
fi
  # End-to-end Helm install tests on KIND across supported Kubernetes minor
  # versions: first an install with NRI disabled (init must be a no-op), then
  # an install with configure+restart and a socket check on the KIND node.
  test-helm-install:
    name: Test Helm Installation
    runs-on: ubuntu-latest
    # Consumes the nri-init image artifact produced by build-test-image.
    needs: [build-test-image]
    strategy:
      matrix:
        # Quoted so 1.30 is not parsed as the float 1.3.
        k8s_version: ["1.28", "1.29", "1.30", "1.31"]
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
      - name: Create kind cluster
        uses: helm/kind-action@v1
        with:
          node_image: kindest/node:v${{ matrix.k8s_version }}.0
          cluster_name: test-cluster-${{ matrix.k8s_version }}
      - name: Download nri-init image artifact and load into Docker
        uses: actions/download-artifact@v4
        with:
          name: nri-init-image
          path: .
      - name: Load image into Docker
        run: |
          docker load -i image.tar
      # First scenario: NRI disabled — the init container should run and exit
      # without configuring anything.
      - name: Install Helm chart with NRI disabled
        env:
          IMAGE: ${{ needs.build-test-image.outputs.image }}
        run: |
          # Prepare test image for KIND cluster and set chart image overrides
          REPO="${IMAGE%:*}"
          TAG="${IMAGE##*:}"
          kind load docker-image "$IMAGE" --name test-cluster-${{ matrix.k8s_version }}
          helm install test-collector charts/collector \
            -f charts/collector/ci/nri-disabled-values.yaml \
            --set image.repository=ghcr.io/${{ github.repository }}/collector \
            --set image.tag=latest \
            --set nri.init.image.repository="$REPO" \
            --set nri.init.image.tag="$TAG" \
            --set-string nri.init.command[0]="/usr/local/bin/nri-init" \
            --wait --timeout 2m
      # Best-effort wait; a timeout here does not fail the step — the log
      # checks in the next step are the actual pass/fail gate.
      - name: Check pod status / wait for init completion
        run: |
          kubectl get pods -l app.kubernetes.io/name=collector
          POD=$(kubectl get pod -l app.kubernetes.io/name=collector --sort-by=.metadata.creationTimestamp -o name | tail -n 1 | cut -d/ -f2)
          echo "Waiting for init container to complete..."
          for i in {1..60}; do
            STATUS=$(kubectl get pod $POD -o jsonpath='{.status.initContainerStatuses[?(@.name=="nri-init")].state}' 2>/dev/null || echo "{}")
            if echo "$STATUS" | grep -q "terminated"; then
              echo "Init container completed"
              break
            fi
            echo "Waiting for init container... ($i/60)"
            sleep 2
          done
      - name: Check init container logs
        run: |
          # Get pod name
          POD=$(kubectl get pod -l app.kubernetes.io/name=collector --sort-by=.metadata.creationTimestamp -o name | tail -n 1 | cut -d/ -f2)
          echo "=== NRI Init Container Logs ==="
          kubectl logs $POD -c nri-init
          # Verify expected log messages (new nri-init format) and fail on explicit failure
          if kubectl logs $POD -c nri-init | grep -q "nri-init failed"; then
            echo "✗ nri-init failed (found failure marker in logs)"
            NODE=$(kubectl get pod "$POD" -o jsonpath='{.spec.nodeName}' 2>/dev/null || true)
            if [ -n "$NODE" ]; then
              echo "=== containerd status on node: $NODE ==="
              docker exec "$NODE" /bin/sh -lc 'systemctl status containerd || true'
              echo "=== recent containerd logs (journalctl) ==="
              docker exec "$NODE" /bin/sh -lc 'journalctl -u containerd --no-pager -n 200 || true'
            fi
            exit 1
          fi
          if kubectl logs $POD -c nri-init | grep -q "Configuration: configure=false, restart=false"; then
            echo "✓ Detected expected configuration flags in logs (configure=false, restart=false)"
          else
            echo "✗ Expected 'Configuration: configure=false, restart=false' in init logs"
            exit 1
          fi
      - name: Uninstall chart
        run: |
          helm uninstall test-collector
      # Second scenario: configure-only values file with restart forced on via
      # --set overrides, i.e. configure=true, restart=true.
      - name: Install Helm chart with NRI configure-and-restart
        env:
          IMAGE: ${{ needs.build-test-image.outputs.image }}
        run: |
          REPO="${IMAGE%:*}"
          TAG="${IMAGE##*:}"
          kind load docker-image "$IMAGE" --name test-cluster-${{ matrix.k8s_version }}
          helm install test-collector charts/collector \
            -f charts/collector/ci/nri-configure-only-values.yaml \
            --set image.repository=ghcr.io/${{ github.repository }}/collector \
            --set image.tag=latest \
            --set nri.init.image.repository="$REPO" \
            --set nri.init.image.tag="$TAG" \
            --set nri.restart=true \
            --set nri.failIfUnavailable=true \
            --set-string nri.init.command[0]="/usr/local/bin/nri-init" \
            --wait --timeout 2m
      - name: Check configuration and restart
        run: |
          # Wait for pod to exist
          echo "Waiting for collector pod to be created..."
          for i in {1..30}; do
            if kubectl get pod -l app.kubernetes.io/name=collector -o jsonpath='{.items[0].metadata.name}' 2>/dev/null; then
              break
            fi
            echo "Waiting for pod... ($i/30)"
            sleep 2
          done
          POD=$(kubectl get pod -l app.kubernetes.io/name=collector --sort-by=.metadata.creationTimestamp -o name | tail -n 1 | cut -d/ -f2)
          echo "Waiting for nri-init completion markers..."
          for i in {1..240}; do
            LOGS=$(kubectl logs "$POD" -c nri-init 2>/dev/null || true)
            if echo "$LOGS" | grep -q "nri-init failed"; then
              echo "✗ nri-init failed (found failure marker)"
              echo "$LOGS"
              NODE=$(kubectl get pod "$POD" -o jsonpath='{.spec.nodeName}' 2>/dev/null || true)
              if [ -n "$NODE" ]; then
                echo "=== containerd status on node: $NODE ==="
                docker exec "$NODE" /bin/sh -lc 'systemctl status containerd || true'
                echo "=== recent containerd logs (journalctl) ==="
                docker exec "$NODE" /bin/sh -lc 'journalctl -u containerd --no-pager -n 200 || true'
              fi
              exit 1
            fi
            if echo "$LOGS" | grep -q "nri-init done"; then
              echo "✓ nri-init done observed"
              break
            fi
            sleep 1
          done
          if ! kubectl logs "$POD" -c nri-init | grep -q "nri-init done"; then
            echo "✗ nri-init completion marker not observed"
            kubectl logs "$POD" -c nri-init || true
            NODE=$(kubectl get pod "$POD" -o jsonpath='{.spec.nodeName}' 2>/dev/null || true)
            if [ -n "$NODE" ]; then
              echo "=== containerd status on node: $NODE ==="
              docker exec "$NODE" /bin/sh -lc 'systemctl status containerd || true'
              echo "=== recent containerd logs (journalctl) ==="
              docker exec "$NODE" /bin/sh -lc 'journalctl -u containerd --no-pager -n 200 || true'
            fi
            exit 1
          fi
          echo "=== NRI Init Container Logs (Configure + Restart) ==="
          kubectl logs $POD -c nri-init
          # Verify configuration flags via structured log line
          if kubectl logs $POD -c nri-init | grep -q "Configuration: configure=true, restart=true"; then
            echo "✓ Detected expected configuration flags in logs (configure=true, restart=true)"
          else
            echo "✗ Expected 'Configuration: configure=true, restart=true' in init logs"
            kubectl logs $POD -c nri-init || true
            exit 1
          fi
      # The socket on the node is the final source of truth for a successful
      # configure+restart; checked via docker exec into the KIND node container.
      - name: Verify NRI socket present on KIND node after restart
        run: |
          set -x
          NODE_NAME="test-cluster-${{ matrix.k8s_version }}-control-plane"
          echo "Waiting for containerd active in node $NODE_NAME..."
          # Best-effort wait; KIND nodes use systemd within container image
          for i in {1..90}; do
            if docker exec "$NODE_NAME" /bin/sh -lc 'systemctl is-active containerd' | grep -q '^active$'; then
              echo "containerd active"
              break
            fi
            sleep 1
          done
          echo "Waiting for NRI socket to appear..."
          for i in {1..120}; do
            if docker exec "$NODE_NAME" /bin/sh -lc 'test -S /var/run/nri/nri.sock'; then
              docker exec "$NODE_NAME" ls -la /var/run/nri/nri.sock
              echo "✓ NRI socket exists after restart"
              exit 0
            fi
            sleep 1
          done
          echo "✗ NRI socket not found on KIND node after restart"
          docker exec "$NODE_NAME" /bin/sh -lc 'ls -la /var/run/nri || true; head -n 200 /etc/containerd/config.toml || true'
          exit 1
  # KIND v0.30 ships node images with NRI already enabled; this job verifies
  # that expectation and that the init container detects the existing socket
  # instead of reconfiguring.
  verify-nri-preconfigured:
    name: Verify NRI Pre-configured on KIND v0.30
    runs-on: ubuntu-latest
    needs: [build-test-image]
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
      - name: Download nri-init image artifact and load into Docker
        uses: actions/download-artifact@v4
        with:
          name: nri-init-image
          path: .
      - name: Load image into Docker
        run: |
          docker load -i image.tar
      - name: Create kind cluster (KIND v0.30)
        uses: helm/kind-action@v1
        with:
          # Pins the KIND tool version (not the node image version).
          version: v0.30.0
          cluster_name: kind-nri-preconfigured
      - name: Verify NRI is already enabled
        run: |
          echo "=== Checking NRI status on KIND v0.30 cluster (docker exec) ==="
          CONTAINER_NAME="kind-nri-preconfigured-control-plane"
          # Check if NRI socket exists (must exist for pre-configured expectation)
          if docker exec $CONTAINER_NAME ls -la /var/run/nri/nri.sock 2>/dev/null; then
            echo "✓ NRI socket found at /var/run/nri/nri.sock"
          else
            echo "✗ NRI socket not found; expected pre-configured NRI on KIND v0.30"
            docker exec $CONTAINER_NAME ls -la /var/run/nri/ 2>/dev/null || echo "NRI directory doesn't exist"
            exit 1
          fi
      - name: Deploy collector with NRI disabled to verify it detects pre-configured NRI
        env:
          IMAGE: ${{ needs.build-test-image.outputs.image }}
        run: |
          echo "=== Deploying collector with NRI configuration disabled ==="
          REPO="${IMAGE%:*}"
          TAG="${IMAGE##*:}"
          CONTAINER_NAME="kind-nri-preconfigured-control-plane"
          # Load the image into KIND nodes
          kind load docker-image "$IMAGE" --name "kind-nri-preconfigured"
          # Install chart without waiting for Ready, since GH runners may prevent the
          # main collector container from becoming Ready. We only need the init to run.
          helm install test-collector charts/collector \
            -f charts/collector/ci/nri-disabled-values.yaml \
            --set image.repository=ghcr.io/${{ github.repository }}/collector \
            --set image.tag=latest \
            --set nri.init.image.repository="$REPO" \
            --set nri.init.image.tag="$TAG" \
            --set-string nri.init.command[0]="/usr/local/bin/nri-init"
          # Wait for the pod to either become Ready or clearly fail, then evaluate init result
          echo "Waiting for collector pod to be created..."
          for i in {1..60}; do
            POD=$(kubectl get pod -l app.kubernetes.io/name=collector -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true)
            if [ -n "$POD" ]; then
              echo "Found pod: $POD"
              break
            fi
            sleep 2
          done
          if [ -z "$POD" ]; then
            echo "✗ Collector pod was not created"
            kubectl get pods -A || true
            exit 1
          fi
          echo "Waiting for pod to be Ready or fail (ignoring collector failures)..."
          for i in {1..180}; do
            READY=$(kubectl get pod "$POD" -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || echo "")
            WAITING_REASON=$(kubectl get pod "$POD" -o jsonpath='{.status.containerStatuses[?(@.name=="collector")].state.waiting.reason}' 2>/dev/null || echo "")
            TERM_REASON=$(kubectl get pod "$POD" -o jsonpath='{.status.containerStatuses[?(@.name=="collector")].state.terminated.reason}' 2>/dev/null || echo "")
            PHASE=$(kubectl get pod "$POD" -o jsonpath='{.status.phase}' 2>/dev/null || echo "")
            # Ready -> proceed; CrashLoopBackOff/Err* or Terminated/Failed -> proceed as well
            if [ "$READY" = "True" ] || [ "$PHASE" = "Failed" ] || [ "$WAITING_REASON" = "CrashLoopBackOff" ] || [ -n "$TERM_REASON" ]; then
              echo "Pod condition reached (Ready=$READY, phase=$PHASE, waiting=$WAITING_REASON, terminated=$TERM_REASON)"
              break
            fi
            sleep 2
          done
          # Get pod name
          POD=$(kubectl get pod -l app.kubernetes.io/name=collector -o jsonpath='{.items[0].metadata.name}')
          echo "=== NRI Init Container Logs ==="
          kubectl logs $POD -c nri-init
          # Check if init container detected pre-existing NRI (socket present)
          if kubectl logs $POD -c nri-init | grep -q "NRI socket found at"; then
            echo "✓ Init container detected pre-configured NRI"
          else
            echo "✗ Init container did not detect pre-configured NRI"
            kubectl logs $POD -c nri-init || true
            exit 1
          fi
          # Explicitly ignore collector container failures on GH runners
          echo "=== Main container logs (ignored for pass/fail) ==="
          kubectl logs "$POD" --tail=200 || true
          helm uninstall test-collector
  # Calls the reusable build workflow to produce the nri-init container image
  # and binary, uploaded as artifacts consumed by the install/integration jobs
  # above (via needs: [build-test-image]). push: false — nothing is published.
  build-test-image:
    name: Build nri-init image and binary (reusable)
    uses: ./.github/workflows/build-component-artifacts.yaml
    permissions:
      contents: read
      packages: read
    with:
      runner: ubuntu-latest
      component: nri-init
      push: false
      upload-image-artifact: true
      upload-binary-artifact: true
unit:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: dtolnay/rust-toolchain@stable
- name: Run unit tests
run: cargo test -p nri-init --lib --verbose
# Safe integration tests (no system services), operate in temp dirs
integration-sim:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: dtolnay/rust-toolchain@stable
- name: Run simulated integration tests
run: cargo test -p nri-init --test integration_sim --verbose
  # Real environment tests (require self-hosted runner with systemd privileges)
  # NOTE(review): despite the comment above, runs-on is ubuntu-latest
  # (GitHub-hosted) — confirm whether these --ignored tests can exercise
  # systemd here or whether this job should target a self-hosted runner.
  integration-real:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: dtolnay/rust-toolchain@stable
      # --ignored runs only tests marked #[ignore]; single-threaded because
      # they touch shared system state.
      - name: K3s tests
        run: cargo test -p nri-init --test k3s -- --ignored --test-threads=1
      - name: Containerd tests
        run: cargo test -p nri-init --test containerd -- --ignored --test-threads=1
  # GH-hosted integration tests that set up KIND and K3s and run the nri-init binary
  integration-matrix:
    name: Rust NRI Integration (${{ matrix.target }} / ${{ matrix.scenario }})
    runs-on: ubuntu-latest
    needs: [build-test-image]
    strategy:
      fail-fast: false
      matrix:
        # 2 targets (kind/k3s) x 2 scenarios (configure-only / +restart)
        # x 2 runners (privileged container DaemonSet / host binary).
        include:
          # KIND
          - target: kind
            scenario: configure-only
            restart: false
            runner: container
          - target: kind
            scenario: configure-only
            restart: false
            runner: binary
          - target: kind
            scenario: configure-and-restart
            restart: true
            runner: container
          - target: kind
            scenario: configure-and-restart
            restart: true
            runner: binary
          # K3s
          - target: k3s
            scenario: configure-only
            restart: false
            runner: binary
          - target: k3s
            scenario: configure-only
            restart: false
            runner: container
          - target: k3s
            scenario: configure-and-restart
            restart: true
            runner: binary
          - target: k3s
            scenario: configure-and-restart
            restart: true
            runner: container
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Download nri-init binary artifact
        if: ${{ matrix.runner == 'binary' }}
        uses: actions/download-artifact@v4
        with:
          name: nri-init-binary
          path: ./bin
      - name: Make nri-init executable
        if: ${{ matrix.runner == 'binary' }}
        run: chmod +x ./bin/nri-init
      - name: Install kubectl
        if: ${{ matrix.target == 'kind' || matrix.target == 'k3s' }}
        run: |
          curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl"
          chmod +x kubectl
          sudo mv kubectl /usr/local/bin/kubectl
          kubectl version --client
      - name: Ensure Docker is running
        if: ${{ matrix.target == 'kind' }}
        run: |
          set -x
          set +e
          (sudo systemctl start docker || sudo service docker start || true)
          docker info
      # KIND target setup
      - name: Install KIND
        if: ${{ matrix.target == 'kind' }}
        run: |
          curl -Lo ./kind https://kind.sigs.k8s.io/dl/v0.24.0/kind-linux-amd64
          chmod +x ./kind
          sudo mv ./kind /usr/local/bin/kind
          kind version
      # Pinned node image (v1.31.0 by digest) so the cluster starts WITHOUT
      # NRI enabled; nri-init is what enables it.
      - name: Create KIND cluster (no NRI)
        if: ${{ matrix.target == 'kind' }}
        env:
          CLUSTER: nri-rust-${{ matrix.scenario }}
        run: |
          # Expand ${CLUSTER} inside the config
          cat > kind-config.yaml << EOF
          kind: Cluster
          apiVersion: kind.x-k8s.io/v1alpha4
          name: ${CLUSTER}
          nodes:
            - role: control-plane
              image: kindest/node:v1.31.0@sha256:53df588e04085fd41ae12de0c3fe4c72f7013bba32a20e7325357a1ac94ba865
          EOF
          kind create cluster --config kind-config.yaml --wait 300s
          kubectl cluster-info --context kind-${CLUSTER}
          kubectl get nodes
      - name: Download nri-init image artifact and load into Docker (KIND target)
        if: ${{ matrix.target == 'kind' }}
        uses: actions/download-artifact@v4
        with:
          name: nri-init-image
          path: .
      - name: Load image into Docker (KIND target)
        if: ${{ matrix.target == 'kind' }}
        run: |
          docker load -i image.tar
      - name: Load test image into KIND
        if: ${{ matrix.target == 'kind' }}
        env:
          IMAGE: ${{ needs.build-test-image.outputs.image }}
          CLUSTER: nri-rust-${{ matrix.scenario }}
        run: |
          set -x
          kind load docker-image "$IMAGE" --name ${CLUSTER}
      # Container mode: run nri-init from a privileged hostPID DaemonSet that
      # mounts the host's /etc/containerd and /var/run/nri, then assert on its
      # logs, the patched config, and (for restart) the NRI socket.
      - name: Run nri-init as privileged DaemonSet on KIND (container mode)
        if: ${{ matrix.target == 'kind' && matrix.runner == 'container' }}
        env:
          TEST_IMAGE: ${{ needs.build-test-image.outputs.image }}
        run: |
          echo '=== Cluster nodes ==='
          kubectl get nodes -o wide || true
          echo '=== System pods ==='
          kubectl get pods -A || true
          # Expand ${TEST_IMAGE} into the manifest
          cat > ds.yaml << EOF
          apiVersion: apps/v1
          kind: DaemonSet
          metadata:
            name: nri-init-test
            namespace: kube-system
            labels: { app: nri-init-test }
          spec:
            selector:
              matchLabels: { app: nri-init-test }
            template:
              metadata:
                labels: { app: nri-init-test }
              spec:
                hostPID: true
                tolerations:
                  - key: "node-role.kubernetes.io/control-plane"
                    operator: "Exists"
                    effect: "NoSchedule"
                containers:
                  - name: runner
                    image: ${TEST_IMAGE}
                    imagePullPolicy: IfNotPresent
                    securityContext:
                      privileged: true
                    command: ["/bin/sh", "-lc"]
                    args:
                      - >-
                        /usr/local/bin/nri-init --log-level debug --configure --nsenter-path nsenter $( [ "${{ matrix.restart }}" = "true" ] && echo "--restart --fail-if-unavailable" || echo "--no-restart" ) || true;
                        echo "sleeping to keep pod Ready";
                        sleep 3600
                    volumeMounts:
                      - { name: host-root, mountPath: /host }
                      - { name: etc-containerd, mountPath: /etc/containerd }
                      - { name: run-nri, mountPath: /var/run/nri }
                volumes:
                  - name: host-root
                    hostPath: { path: /, type: Directory }
                  - name: etc-containerd
                    hostPath: { path: /etc/containerd, type: DirectoryOrCreate }
                  - name: run-nri
                    hostPath: { path: /var/run/nri, type: DirectoryOrCreate }
          EOF
          set +e
          kubectl apply -f ds.yaml
          # Wait up to 60s for the DaemonSet to be Available
          kubectl -n kube-system rollout status ds/nri-init-test --timeout=60s
          ROLLOUT=$?
          POD=$(kubectl -n kube-system get pod -l app=nri-init-test -o jsonpath='{.items[0].metadata.name}' 2>/dev/null)
          if [ "$ROLLOUT" != "0" ]; then
            echo '--- DS Status ---'
            kubectl -n kube-system get ds nri-init-test -o wide || true
            kubectl -n kube-system describe ds nri-init-test || true
            echo '--- Pods ---'
            kubectl -n kube-system get pods -l app=nri-init-test -o wide || true
            echo '--- Recent Events ---'
            kubectl -n kube-system get events --sort-by=.lastTimestamp | tail -n 50 || true
            if [ -n "$POD" ]; then
              echo '--- Pod Describe ---'
              kubectl -n kube-system describe pod "$POD" || true
              echo '--- Pod Logs ---'
              kubectl -n kube-system logs "$POD" || true
            fi
            exit 1
          fi
          echo "=== Wait for nri-init completion (KIND DS) ==="
          for i in {1..120}; do
            LOGS=$(kubectl -n kube-system logs "$POD" || true)
            if echo "$LOGS" | grep -q "nri-init failed"; then
              echo "✗ Detected 'nri-init failed' in logs"
              echo "$LOGS"
              exit 1
            fi
            if echo "$LOGS" | grep -q "nri-init done"; then
              echo "✓ nri-init done observed in logs"
              break
            fi
            sleep 1
          done
          if ! kubectl -n kube-system logs "$POD" | grep -q "nri-init done"; then
            echo "✗ Did not observe 'nri-init done' in logs within 120s"
            kubectl -n kube-system logs "$POD" || true
            exit 1
          fi
          echo "=== nri-init logs (KIND DS) ==="
          kubectl -n kube-system logs "$POD" || true
          # Note: nri-init may report a transient failure immediately after restart.
          # Treat final socket presence as the source of truth.
          echo "=== Verify host containerd config patched ==="
          kubectl -n kube-system exec "$POD" -- sh -lc 'test -f /etc/containerd/config.toml && grep -q "plugins.\"io.containerd.nri.v1.nri\"" /etc/containerd/config.toml && grep -q "disable = false" /etc/containerd/config.toml'
          # If restart requested, wait for containerd active on node, then verify socket appears
          if [ "${{ matrix.restart }}" = "true" ]; then
            echo "=== Waiting for containerd to be active (KIND) ==="
            for i in {1..90}; do
              if kubectl -n kube-system exec "$POD" -- sh -lc 'nsenter --target 1 --mount --uts --ipc --net --pid -- systemctl is-active containerd' | grep -q '^active$'; then
                echo "✓ containerd is active on node"
                break
              fi
              sleep 1
            done
            kubectl -n kube-system exec "$POD" -- sh -lc 'nsenter --target 1 --mount --uts --ipc --net --pid -- systemctl status containerd || true'
            for i in {1..120}; do
              if kubectl -n kube-system exec "$POD" -- sh -lc 'test -S /var/run/nri/nri.sock'; then
                echo "✓ NRI socket exists after restart"
                break
              fi
              sleep 1
            done
            if ! kubectl -n kube-system exec "$POD" -- sh -lc 'test -S /var/run/nri/nri.sock'; then
              echo "✗ NRI socket not found after restart"
              echo '--- DS Status ---'
              kubectl -n kube-system get ds nri-init-test -o wide || true
              kubectl -n kube-system describe ds nri-init-test || true
              echo '--- Pod Logs ---'
              kubectl -n kube-system logs "$POD" || true
              echo '--- containerd service status via nsenter ---'
              # Try to inspect containerd service on the node
              kubectl -n kube-system exec "$POD" -- sh -lc 'nsenter --target 1 --mount --uts --ipc --net --pid -- systemctl status containerd || true'
              echo '--- containerd process list on node ---'
              kubectl -n kube-system exec "$POD" -- sh -lc 'nsenter --target 1 --mount --uts --ipc --net --pid -- ps aux | grep -E "containerd( |$)" || true'
              exit 1
            fi
          fi
      # Binary mode: copy the built binary into the KIND node container and
      # run it there directly via docker exec.
      # NOTE(review): the BIN_RC=$? check below appears unreachable for
      # non-zero exits — the default Actions bash runs with -e, so a failing
      # docker exec would abort the step first. Confirm intended behavior.
      - name: Run nri-init directly inside KIND node (binary mode)
        if: ${{ matrix.target == 'kind' && matrix.runner == 'binary' }}
        env:
          CLUSTER: nri-rust-${{ matrix.scenario }}
        run: |
          set -x
          NODE=${CLUSTER}-control-plane
          # Copy binary into KIND node
          docker cp ./bin/nri-init ${NODE}:/usr/local/bin/nri-init
          # Sanity on node environment
          docker exec ${NODE} /bin/sh -lc "id && uname -a && containerd --version || true"
          docker exec ${NODE} /bin/sh -lc "ls -la /etc/containerd || true && head -n 60 /etc/containerd/config.toml || true"
          # Run configure (and optional restart) inside the node (explicit config path)
          docker exec ${NODE} /bin/sh -lc "\
          /usr/local/bin/nri-init --log-level debug --mode containerd --containerd-config /etc/containerd/config.toml --configure $( [ '${{ matrix.restart }}' = 'true' ] && echo '--restart --fail-if-unavailable' || echo '--no-restart' )"
          BIN_RC=$?
          if [ "${{ matrix.restart }}" = "true" ] && [ "$BIN_RC" -ne 0 ]; then
            echo "✗ nri-init binary failed in restart mode (rc=$BIN_RC)"
            exit 1
          fi
          echo '=== Show containerd config ==='
          docker exec ${NODE} /bin/sh -lc "head -n 200 /etc/containerd/config.toml || true"
          # Verify config edited (print context on failure)
          if ! docker exec ${NODE} /bin/sh -lc "grep -q 'plugins.\"io.containerd.nri.v1.nri\"' /etc/containerd/config.toml && grep -q 'disable = false' /etc/containerd/config.toml"; then
            echo 'Final /etc/containerd/config.toml:'
            docker exec ${NODE} /bin/sh -lc 'sed -n "1,200p" /etc/containerd/config.toml || true'
            exit 1
          fi
          # If restart requested, best-effort check for socket (may be unsupported in KIND)
          if [ "${{ matrix.restart }}" = "true" ]; then
            if docker exec ${NODE} /bin/sh -lc "test -S /var/run/nri/nri.sock"; then
              echo "✓ NRI socket present after restart"
            else
              echo "ℹ NRI socket missing; restart may be NotSupported in KIND"
            fi
          fi
      - name: Cleanup KIND
        if: ${{ always() && matrix.target == 'kind' }}
        env:
          CLUSTER: nri-rust-${{ matrix.scenario }}
        run: |
          kind delete cluster --name ${CLUSTER} || true
      # K3s target setup
      - name: Setup k3s (pinned)
        if: ${{ matrix.target == 'k3s' }}
        uses: ./.github/actions/setup-k3s
        with:
          k3s_version: v1.31.5+k3s1
          kubeconfig_path: /etc/rancher/k3s/k3s.yaml
          disable_packaged_addons: true
          preflight_inotify: true
          timeout_api_server_ready_seconds: 300
          timeout_node_ready_seconds: 300
      - name: Copy kubeconfig for non-root use
        if: ${{ matrix.target == 'k3s' }}
        run: |
          mkdir -p ~/.kube
          sudo cp /etc/rancher/k3s/k3s.yaml ~/.kube/config
          sudo chown $(id -u):$(id -g) ~/.kube/config
      - name: Download nri-init image artifact (K3s)
        if: ${{ matrix.target == 'k3s' }}
        uses: actions/download-artifact@v4
        with:
          name: nri-init-image
          path: .
      - name: Import image into k3s containerd
        if: ${{ matrix.target == 'k3s' }}
        run: |
          # Import the image archive into the k3s-managed containerd (k8s.io namespace)
          sudo k3s ctr -n k8s.io images import image.tar
          sudo k3s ctr -n k8s.io images ls | head -n 50 || true
      # K3s binary mode: run nri-init on the runner host against the k3s
      # containerd template files, then check template patching and socket
      # presence/absence according to the restart flag.
      - name: Run nri-init on host (K3s, binary mode)
        if: ${{ matrix.target == 'k3s' && matrix.runner == 'binary' }}
        run: |
          echo "=== Run nri-init for K3s (configure=true restart=${{ matrix.restart }}) ==="
          set -x
          FAILED=0
          if ! sudo ./bin/nri-init --log-level debug \
            --mode k3s \
            $( [ "${{ matrix.restart }}" = "true" ] && echo "--restart" || echo "--no-restart" ) \
            --configure \
            $( [ "${{ matrix.restart }}" = "true" ] && echo "--fail-if-unavailable" ); then
            echo "nri-init (k3s) failed; dumping k3s logs and template"
            sudo journalctl -u k3s --no-pager | tail -n 200 || true
            sudo ls -l /var/lib/rancher/k3s/agent/etc/containerd || true
            sudo sed -n '1,200p' /var/lib/rancher/k3s/agent/etc/containerd/config.toml.tmpl || true
            sudo sed -n '1,200p' /var/lib/rancher/k3s/agent/etc/containerd/config-v3.toml.tmpl || true
            FAILED=1
          fi
          echo "=== Verify K3s template patched ==="
          sudo ls -la /var/lib/rancher/k3s/agent/etc/containerd || true
          sudo head -n 80 /var/lib/rancher/k3s/agent/etc/containerd/config.toml.tmpl 2>/dev/null || true
          sudo head -n 80 /var/lib/rancher/k3s/agent/etc/containerd/config-v3.toml.tmpl 2>/dev/null || true
          if [ -f "/var/lib/rancher/k3s/agent/etc/containerd/config.toml.tmpl" ]; then
            if sudo grep -q 'plugins."io.containerd.nri.v1.nri"' /var/lib/rancher/k3s/agent/etc/containerd/config.toml.tmpl; then
              echo "✓ NRI section present in template"
            else
              echo "⚠ NRI section missing from K3s template (configure-only); proceeding"
              sudo head -n 80 /var/lib/rancher/k3s/agent/etc/containerd/config.toml.tmpl || true
            fi
          fi
          if [ "${{ matrix.restart }}" = "false" ]; then
            echo "=== Verify socket absent when restart=false ==="
            if sudo test -S /var/run/nri/nri.sock; then
              echo "✗ Socket present but restart=false"
              exit 1
            else
              echo "✓ Socket absent as expected (no restart)"
            fi
          else
            echo "=== Verify socket after restart ==="
            for i in {1..30}; do
              if sudo test -S /var/run/nri/nri.sock; then
                sudo ls -la /var/run/nri/nri.sock
                echo "✓ NRI socket exists after restart"
                break
              fi
              sleep 1
            done
            if ! sudo test -S /var/run/nri/nri.sock; then
              echo "✗ NRI socket not found after restart on K3s"
              exit 1
            fi
          fi
          if [ "$FAILED" = "1" ]; then
            echo "nri-init returned failure earlier, but templates/socket checks passed; continuing"
          fi
      # K3s container mode: same DaemonSet approach as KIND, but mounting the
      # k3s containerd template directory instead of /etc/containerd.
      - name: Run nri-init as DaemonSet on K3s (container mode)
        if: ${{ matrix.target == 'k3s' && matrix.runner == 'container' }}
        env:
          TEST_IMAGE: ${{ needs.build-test-image.outputs.image }}
        run: |
          set -x
          kubectl get nodes -o wide || true
          # Deploy DaemonSet that mounts k3s template dir and runs nri-init
          cat > ds-k3s.yaml << EOF
          apiVersion: apps/v1
          kind: DaemonSet
          metadata:
            name: nri-init-test-k3s
            namespace: kube-system
            labels: { app: nri-init-test-k3s }
          spec:
            selector:
              matchLabels: { app: nri-init-test-k3s }
            template:
              metadata:
                labels: { app: nri-init-test-k3s }
              spec:
                hostPID: true
                tolerations:
                  - key: "node-role.kubernetes.io/control-plane"
                    operator: "Exists"
                    effect: "NoSchedule"
                containers:
                  - name: runner
                    image: ${TEST_IMAGE}
                    imagePullPolicy: IfNotPresent
                    securityContext:
                      privileged: true
                    command: ["/bin/sh", "-lc"]
                    args:
                      - >-
                        /usr/local/bin/nri-init --log-level debug --mode k3s --configure --nsenter-path nsenter $( [ "${{ matrix.restart }}" = "true" ] && echo "--restart --fail-if-unavailable" || echo "--no-restart" ) || true;
                        echo "sleeping to keep pod Ready";
                        sleep 3600
                    volumeMounts:
                      - { name: host-root, mountPath: /host }
                      - { name: k3s-containerd, mountPath: /var/lib/rancher/k3s/agent/etc/containerd }
                      - { name: run-nri, mountPath: /var/run/nri }
                volumes:
                  - name: host-root
                    hostPath: { path: /, type: Directory }
                  - name: k3s-containerd
                    hostPath: { path: /var/lib/rancher/k3s/agent/etc/containerd, type: DirectoryOrCreate }
                  - name: run-nri
                    hostPath: { path: /var/run/nri, type: DirectoryOrCreate }
          EOF
          kubectl apply -f ds-k3s.yaml
          kubectl -n kube-system rollout status ds/nri-init-test-k3s --timeout=60s || true
          echo "=== Waiting for DaemonSet pod to appear (K3s) ==="
          for i in {1..60}; do
            POD=$(kubectl -n kube-system get pod -l app=nri-init-test-k3s -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true)
            if [ -n "$POD" ]; then
              echo "Found pod: $POD"
              break
            fi
            sleep 2
          done
          if [ -z "$POD" ]; then
            echo "✗ Timed out waiting for DaemonSet pod"
            kubectl -n kube-system get pods -l app=nri-init-test-k3s -o wide || true
            exit 1
          fi
          echo "=== Wait for nri-init completion (K3s DS) ==="
          for i in {1..180}; do
            LOGS=$(kubectl -n kube-system logs "$POD" || true)
            if echo "$LOGS" | grep -q "nri-init failed"; then
              echo "✗ Detected 'nri-init failed' in logs"
              echo "$LOGS"
              exit 1
            fi
            if echo "$LOGS" | grep -q "nri-init done"; then
              echo "✓ nri-init done observed in logs"
              break
            fi
            sleep 1
          done
          if ! kubectl -n kube-system logs "$POD" | grep -q "nri-init done"; then
            echo "✗ Did not observe 'nri-init done' in logs within 180s"
            kubectl -n kube-system logs "$POD" || true
            exit 1
          fi
          echo "=== nri-init logs (k3s DS) ==="
          kubectl -n kube-system logs "$POD" || true
          # Note: nri-init may report a transient failure immediately after restart.
          # Treat final socket presence as the source of truth.
          echo "=== Verify K3s template patched ==="
          kubectl -n kube-system exec "$POD" -- sh -lc 'ls -la /var/lib/rancher/k3s/agent/etc/containerd || true'
          kubectl -n kube-system exec "$POD" -- sh -lc 'test -f /var/lib/rancher/k3s/agent/etc/containerd/config.toml.tmpl || test -f /var/lib/rancher/k3s/agent/etc/containerd/config-v3.toml.tmpl'
          # Socket expectations based on restart flag (after ensuring containerd is active)
          if [ "${{ matrix.restart }}" = "false" ]; then
            if kubectl -n kube-system exec "$POD" -- sh -lc 'test -S /var/run/nri/nri.sock'; then
              echo "✗ Socket present but restart=false"
              exit 1
            fi
          else
            echo "=== Waiting for containerd to be active (K3s) ==="
            for i in {1..90}; do
              if kubectl -n kube-system exec "$POD" -- sh -lc 'nsenter --target 1 --mount --uts --ipc --net --pid -- systemctl is-active containerd' | grep -q '^active$'; then
                echo "✓ containerd is active on node"
                break
              fi
              sleep 1
            done
            kubectl -n kube-system exec "$POD" -- sh -lc 'nsenter --target 1 --mount --uts --ipc --net --pid -- systemctl status containerd || true'
            for i in {1..90}; do
              if kubectl -n kube-system exec "$POD" -- sh -lc 'test -S /var/run/nri/nri.sock'; then
                echo "✓ NRI socket exists after restart"
                break
              fi
              sleep 1
            done
            if ! kubectl -n kube-system exec "$POD" -- sh -lc 'test -S /var/run/nri/nri.sock'; then
              echo "✗ NRI socket not found after restart"
              exit 1
            fi
          fi