End-to-end GPU driver testing enhancement
Signed-off-by: shiva kumar <[email protected]>
shivakunv committed Aug 17, 2024
1 parent 600b7bf commit 13839ac
Showing 9 changed files with 346 additions and 44 deletions.
33 changes: 24 additions & 9 deletions .github/workflows/ci.yaml
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

name: CI
name: End-to-end tests

on:
workflow_run:
@@ -21,15 +21,10 @@ on:
- completed
branches:
- main

jobs:
e2e-tests-nvidiadriver:
runs-on: ubuntu-latest
strategy:
matrix:
driver:
- 535.183.06
- 550.90.07

steps:
- name: Check out code
@@ -41,7 +36,6 @@ jobs:
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AWS_SSH_KEY: ${{ secrets.AWS_SSH_KEY }}
AWS_SESSION_TOKEN: ${{ secrets.AWS_SESSION_TOKEN }}
with:
aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
@@ -59,11 +53,32 @@ jobs:
echo "instance_hostname=ubuntu@${{ steps.get_public_dns_name.outputs.result }}" >> $GITHUB_ENV
echo "private_key=${{ github.workspace }}/key.pem" >> $GITHUB_ENV
echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> $GITHUB_ENV
DRIVER_VERSIONS=$(grep '^DRIVER_VERSIONS ?=' versions.mk | awk -F' ?= ' '{print $2}')
echo "DRIVER_VERSIONS=$DRIVER_VERSIONS" >> $GITHUB_ENV
- name: Validate gpu driver
env:
TEST_CASE: "./tests/cases/nvidia-driver.sh"
run: |
sudo chmod 644 ${{ github.workspace }}/.cache/key
echo "${{ secrets.AWS_SSH_KEY }}" > ${private_key} && chmod 400 ${private_key}
./tests/ci-run-e2e.sh ${TEST_CASE} ${COMMIT_SHORT_SHA}-${{ matrix.driver }}
rc=0
for driver_version in ${DRIVER_VERSIONS}; do
echo "Running e2e for DRIVER_VERSION=$driver_version"
status=0
./tests/ci-run-e2e.sh ${TEST_CASE} ${COMMIT_SHORT_SHA}-${driver_version} || status=$?
if [ $status -ne 0 ]; then
echo "e2e validation failed for driver version $driver_version with status $status"
rc=$status
fi
done
source ./tests/scripts/.definitions.sh
./tests/scripts/pull.sh ${LOG_DIR} logs
exit $rc
- name: Archive test logs
if: ${{ failure() }}
uses: actions/upload-artifact@v4
with:
name: nvidiadriver-e2e-test-logs
path: ./logs/
retention-days: 15
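
Note on the workflow change: the job no longer hardcodes a driver matrix; it derives the versions to test from versions.mk at runtime. A minimal sketch of how that extraction behaves, assuming versions.mk declares the versions on a single DRIVER_VERSIONS line (versions.mk itself is not part of this diff, so its exact contents are an assumption):

    # Hypothetical versions.mk line (illustrative only):
    #   DRIVER_VERSIONS ?= 535.183.06 550.90.07
    DRIVER_VERSIONS=$(grep '^DRIVER_VERSIONS ?=' versions.mk | awk -F' ?= ' '{print $2}')
    echo "$DRIVER_VERSIONS"
    # -> 535.183.06 550.90.07
    for driver_version in ${DRIVER_VERSIONS}; do
        echo "would run e2e for ${driver_version}"
    done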
5 changes: 5 additions & 0 deletions tests/scripts/.definitions.sh
@@ -19,3 +19,8 @@ CASES_DIR="$( cd "${TEST_DIR}/cases" && pwd )"
: ${HELM_NVIDIA_REPO:="https://helm.ngc.nvidia.com/nvidia"}

: ${TARGET_DRIVER_VERSION:="550.90.07"}

: ${DAEMON_POD_STATUS_TIME_OUT:="15m"}
: ${POD_STATUS_TIME_OUT:="2m"}

: ${LOG_DIR:="/tmp/logs"}
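
Because these defaults use the ": ${VAR:=value}" form, they only take effect when the variable is not already set, so a CI job or developer can override them from the environment. A small usage sketch; the values are illustrative, and it assumes the test case script sources .definitions.sh:

    # Override the new timeouts and log directory for a single run
    export DAEMON_POD_STATUS_TIME_OUT="30m"
    export POD_STATUS_TIME_OUT="5m"
    export LOG_DIR="/tmp/e2e-logs"
    ./tests/cases/nvidia-driver.sh   # assumed to source tests/scripts/.definitions.sh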
39 changes: 12 additions & 27 deletions tests/scripts/checks.sh
@@ -2,35 +2,20 @@

check_pod_ready() {
local pod_label=$1
local current_time=0
while :; do
echo "Checking $pod_label pod"
kubectl get pods -lapp=$pod_label -n ${TEST_NAMESPACE}
local pod_status_time_out=$2

echo "Checking $pod_label pod"

kubectl get pods -lapp=$pod_label -n ${TEST_NAMESPACE}

echo "Checking $pod_label pod readiness"
is_pod_ready=$(kubectl get pods -lapp=$pod_label -n ${TEST_NAMESPACE} -ojsonpath='{range .items[*]}{.status.conditions[?(@.type=="Ready")].status}{"\n"}{end}' 2>/dev/null || echo "terminated")
echo "Checking $pod_label pod readiness"

if [ "${is_pod_ready}" = "True" ]; then
# Check if the pod is not in terminating state
is_pod_terminating=$(kubectl get pods -lapp=$pod_label -n ${TEST_NAMESPACE} -o jsonpath='{.items[0].metadata.deletionGracePeriodSeconds}' 2>/dev/null || echo "terminated")
if [ "${is_pod_terminating}" != "" ]; then
echo "pod $pod_label is in terminating state..."
else
echo "Pod $pod_label is ready"
break;
fi
fi

if [[ "${current_time}" -gt $((60 * 45)) ]]; then
echo "timeout reached"
exit 1;
fi

# Echo useful information on stdout
if kubectl wait -n ${TEST_NAMESPACE} --for=condition=Ready pod -l app=$pod_label --timeout ${pod_status_time_out}; then
return 0
else
# print status of pod
kubectl get pods -n ${TEST_NAMESPACE}
fi

echo "Sleeping 5 seconds"
current_time=$((${current_time} + 5))
sleep 5
done
return 1
}
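
With this rewrite, check_pod_ready delegates the polling to kubectl wait and takes the timeout as its second argument, so each caller chooses how long to wait per component. A hypothetical invocation, assuming checks.sh and .definitions.sh are sourced together, that TEST_NAMESPACE is defined in .definitions.sh, and that the pod labels shown here are the ones used by the operands:

    source ./tests/scripts/.definitions.sh
    source ./tests/scripts/checks.sh

    # Driver daemonset pods can take a while to build and load the kernel modules
    check_pod_ready "nvidia-driver-daemonset" "${DAEMON_POD_STATUS_TIME_OUT}" || exit 1

    # Other operand pods are expected to become Ready much faster
    check_pod_ready "nvidia-device-plugin-daemonset" "${POD_STATUS_TIME_OUT}" || exit 1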
6 changes: 5 additions & 1 deletion tests/scripts/end-to-end-nvidia-driver.sh
@@ -7,8 +7,12 @@ echo ""
echo ""
echo "--------------Installing the GPU Operator--------------"

# Install the operator with usePrecompiled mode set to true
${SCRIPT_DIR}/install-operator.sh

"${SCRIPT_DIR}"/verify-operator.sh

echo "--------------Verification completed for GPU Operator, uninstalling the operator--------------"

${SCRIPT_DIR}/uninstall-operator.sh ${TEST_NAMESPACE} "gpu-operator"

echo "--------------Verification completed for GPU Operator--------------"
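
The end-to-end script now uninstalls the operator after verification, so back-to-back runs against different driver versions start from a clean release. A hedged usage sketch; TEST_NAMESPACE and TARGET_DRIVER_VERSION appear in the scripts above, but the values shown here are assumptions:

    TEST_NAMESPACE="gpu-operator" \
    TARGET_DRIVER_VERSION="550.90.07" \
      ./tests/scripts/end-to-end-nvidia-driver.sh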
256 changes: 256 additions & 0 deletions tests/scripts/must-gather.sh
@@ -0,0 +1,256 @@
#!/usr/bin/env bash

set -o nounset
set -x

K=kubectl
if ! $K version > /dev/null; then
K=oc

if ! $K version > /dev/null; then
echo "FATAL: neither 'kubectl' nor 'oc' appear to be working properly. Exiting ..."
exit 1
fi
fi

if [[ "$0" == "/usr/bin/gather" ]]; then
echo "Running as must-gather plugin image"
export ARTIFACT_DIR=/must-gather
else
if [ -z "${ARTIFACT_DIR:-}" ]; then
export ARTIFACT_DIR="/tmp/nvidia-gpu-operator_$(date +%Y%m%d_%H%M)"
fi
echo "Using ARTIFACT_DIR=$ARTIFACT_DIR"
fi

mkdir -p "$ARTIFACT_DIR"

echo

exec 1> >(tee $ARTIFACT_DIR/must-gather.log)
exec 2> $ARTIFACT_DIR/must-gather.stderr.log

if [[ "$0" == "/usr/bin/gather" ]]; then
echo "NVIDIA GPU Operator" > $ARTIFACT_DIR/version
echo "${VERSION:-N/A}" >> $ARTIFACT_DIR/version
fi

ocp_cluster=$($K get clusterversion/version --ignore-not-found -oname || true)

if [[ "$ocp_cluster" ]]; then
echo "Running in OpenShift."
echo "Get the cluster version"
$K get clusterversion/version -oyaml > $ARTIFACT_DIR/openshift_version.yaml
fi

echo "Get the operator namespaces"
OPERATOR_POD_NAME=$($K get pods -lapp=gpu-operator -oname -A)

if [ -z "$OPERATOR_POD_NAME" ]; then
echo "FATAL: could not find the GPU Operator Pod ..."
exit 1
fi

OPERATOR_NAMESPACE=$($K get pods -lapp=gpu-operator -A -ojsonpath={.items[].metadata.namespace} --ignore-not-found)

echo "Using '$OPERATOR_NAMESPACE' as operator namespace"
echo ""

echo "#"
echo "# ClusterPolicy"
echo "#"
echo

CLUSTER_POLICY_NAME=$($K get clusterpolicy -oname)

if [[ "$CLUSTER_POLICY_NAME" ]]; then
echo "Get $CLUSTER_POLICY_NAME"
$K get -oyaml $CLUSTER_POLICY_NAME > $ARTIFACT_DIR/cluster_policy.yaml
else
echo "Mark the ClusterPolicy as missing"
touch $ARTIFACT_DIR/cluster_policy.missing
fi

echo
echo "#"
echo "# Nodes and machines"
echo "#"
echo

if [ "$ocp_cluster" ]; then
echo "Get all the machines"
$K get machines -A > $ARTIFACT_DIR/all_machines.list
fi

echo "Get the labels of the nodes with NVIDIA PCI cards"

GPU_PCI_LABELS=(feature.node.kubernetes.io/pci-10de.present feature.node.kubernetes.io/pci-0302_10de.present feature.node.kubernetes.io/pci-0300_10de.present)

gpu_pci_nodes=""
for label in ${GPU_PCI_LABELS[@]}; do
gpu_pci_nodes="$gpu_pci_nodes $($K get nodes -l$label -oname)"
done

if [ -z "$gpu_pci_nodes" ]; then
echo "FATAL: could not find nodes with NVIDIA PCI labels"
exit 0
fi

for node in $(echo "$gpu_pci_nodes"); do
echo "$node" | cut -d/ -f2 >> $ARTIFACT_DIR/gpu_nodes.labels
$K get $node '-ojsonpath={.metadata.labels}' \
| sed 's|,|,- |g' \
| tr ',' '\n' \
| sed 's/{"/- /' \
| tr : = \
| sed 's/"//g' \
| sed 's/}/\n/' \
>> $ARTIFACT_DIR/gpu_nodes.labels
echo "" >> $ARTIFACT_DIR/gpu_nodes.labels
done

echo "Get the GPU nodes (status)"
$K get nodes -l nvidia.com/gpu.present=true -o wide > $ARTIFACT_DIR/gpu_nodes.status

echo "Get the GPU nodes (description)"
$K describe nodes -l nvidia.com/gpu.present=true > $ARTIFACT_DIR/gpu_nodes.descr

echo ""
echo "#"
echo "# Operator Pod"
echo "#"
echo

echo "Get the GPU Operator Pod (status)"
$K get $OPERATOR_POD_NAME \
-owide \
-n $OPERATOR_NAMESPACE \
> $ARTIFACT_DIR/gpu_operator_pod.status

echo "Get the GPU Operator Pod (yaml)"
$K get $OPERATOR_POD_NAME \
-oyaml \
-n $OPERATOR_NAMESPACE \
> $ARTIFACT_DIR/gpu_operator_pod.yaml

echo "Get the GPU Operator Pod logs"
$K logs $OPERATOR_POD_NAME \
-n $OPERATOR_NAMESPACE \
> "$ARTIFACT_DIR/gpu_operator_pod.log"

$K logs $OPERATOR_POD_NAME \
-n $OPERATOR_NAMESPACE \
--previous \
> "$ARTIFACT_DIR/gpu_operator_pod.previous.log"

echo ""
echo "#"
echo "# Operand Pods"
echo "#"
echo ""

echo "Get the Pods in $OPERATOR_NAMESPACE (status)"
$K get pods -owide \
-n $OPERATOR_NAMESPACE \
> $ARTIFACT_DIR/gpu_operand_pods.status

echo "Get the Pods in $OPERATOR_NAMESPACE (yaml)"
$K get pods -oyaml \
-n $OPERATOR_NAMESPACE \
> $ARTIFACT_DIR/gpu_operand_pods.yaml

echo "Get the GPU Operator Pods Images"
$K get pods -n $OPERATOR_NAMESPACE \
-o=jsonpath='{range .items[*]}{"\n"}{.metadata.name}{":\t"}{range .spec.containers[*]}{.image}{" "}{end}{end}' \
> $ARTIFACT_DIR/gpu_operand_pod_images.txt

echo "Get the description and logs of the GPU Operator Pods"

for pod in $($K get pods -n $OPERATOR_NAMESPACE -oname);
do
if ! $K get $pod -n $OPERATOR_NAMESPACE -ojsonpath={.metadata.labels} | egrep --quiet '(nvidia|gpu)'; then
echo "Skipping $pod, not a NVIDA/GPU Pod ..."
continue
fi
pod_name=$(echo "$pod" | cut -d/ -f2)

if [ $pod == $OPERATOR_POD_NAME ]; then
echo "Skipping operator pod $pod_name ..."
continue
fi

$K logs $pod \
-n $OPERATOR_NAMESPACE \
--all-containers --prefix \
> $ARTIFACT_DIR/gpu_operand_pod_$pod_name.log

$K logs $pod \
-n $OPERATOR_NAMESPACE \
--all-containers --prefix \
--previous \
> $ARTIFACT_DIR/gpu_operand_pod_$pod_name.previous.log

$K describe $pod \
-n $OPERATOR_NAMESPACE \
> $ARTIFACT_DIR/gpu_operand_pod_$pod_name.descr
done

echo ""
echo "#"
echo "# Operand DaemonSets"
echo "#"
echo ""

echo "Get the DaemonSets in $OPERATOR_NAMESPACE (status)"

$K get ds \
-n $OPERATOR_NAMESPACE \
> $ARTIFACT_DIR/gpu_operand_ds.status


echo "Get the DaemonSets in $OPERATOR_NAMESPACE (yaml)"

$K get ds -oyaml \
-n $OPERATOR_NAMESPACE \
> $ARTIFACT_DIR/gpu_operand_ds.yaml

echo "Get the description of the GPU Operator DaemonSets"

for ds in $($K get ds -n $OPERATOR_NAMESPACE -oname);
do
if ! $K get $ds -n $OPERATOR_NAMESPACE -ojsonpath={.metadata.labels} | egrep --quiet '(nvidia|gpu)'; then
echo "Skipping $ds, not a NVIDA/GPU DaemonSet ..."
continue
fi
$K describe $ds \
-n $OPERATOR_NAMESPACE \
> $ARTIFACT_DIR/gpu_operand_ds_$(echo "$ds" | cut -d/ -f2).descr
done

echo ""
echo "#"
echo "# nvidia-bug-report.sh"
echo "#"
echo ""

for pod in $($K get pods -lopenshift.driver-toolkit -oname -n $OPERATOR_NAMESPACE; $K get pods -lapp=nvidia-driver-daemonset -oname -n $OPERATOR_NAMESPACE; $K get pods -lapp=nvidia-vgpu-manager-daemonset -oname -n $OPERATOR_NAMESPACE);
do
pod_nodename=$($K get $pod -ojsonpath={.spec.nodeName} -n $OPERATOR_NAMESPACE)
echo "Saving nvidia-bug-report from ${pod_nodename} ..."

$K exec -n $OPERATOR_NAMESPACE $pod -- bash -c 'cd /tmp && nvidia-bug-report.sh' >&2 || \
(echo "Failed to collect nvidia-bug-report from ${pod_nodename}" && continue)

$K cp $OPERATOR_NAMESPACE/$(basename $pod):/tmp/nvidia-bug-report.log.gz /tmp/nvidia-bug-report.log.gz || \
(echo "Failed to save nvidia-bug-report from ${pod_nodename}" && continue)

mv /tmp/nvidia-bug-report.log.gz $ARTIFACT_DIR/nvidia-bug-report_${pod_nodename}.log.gz
done

echo ""
echo "#"
echo "# All done!"
if [[ "$0" != "/usr/bin/gather" ]]; then
echo "# Logs saved into ${ARTIFACT_DIR}."
fi
echo "#"