end-to-end gpu driver testing enhancement

Signed-off-by: shiva kumar <[email protected]>
NVIDIA · Aug 16, 2024 · b042889 · b042889
1 parent 600b7bf
commit b042889
Show file tree

Hide file tree

Showing 7 changed files with 181 additions and 121 deletions.
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-name: CI
+name: End-to-end tests
 
 on:
   workflow_run:
@@ -22,14 +22,20 @@ on:
     branches:
       - main
 
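+  # NOTE(review): tagged "SHIVA" by the author; the pull_request/push triggers below look temporary - confirm before merging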
+  pull_request:
+    types:
+      - opened
+      - synchronize
+    branches:
+      - main
+  push:
+    branches:
+      - main
+
 jobs:
   e2e-tests-nvidiadriver:
     runs-on: ubuntu-latest
-    strategy:
-      matrix:
-        driver:
-          - 535.183.06
-          - 550.90.07
 
     steps:
     - name: Check out code
@@ -41,7 +47,6 @@ jobs:
         AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
         AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
         AWS_SSH_KEY: ${{ secrets.AWS_SSH_KEY }}
-        AWS_SESSION_TOKEN: ${{ secrets.AWS_SESSION_TOKEN }}
       with:
         aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
         aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
@@ -59,11 +64,34 @@ jobs:
         echo "instance_hostname=ubuntu@${{ steps.get_public_dns_name.outputs.result }}" >> $GITHUB_ENV
         echo "private_key=${{ github.workspace }}/key.pem" >> $GITHUB_ENV
         echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> $GITHUB_ENV
+        echo "DRIVER_VERSIONS=$(make -f versions.mk DRIVER_VERSIONS)" >> $GITHUB_ENV
         
     - name: Validate gpu driver
       env:
         TEST_CASE: "./tests/cases/nvidia-driver.sh"
       run: |
         sudo chmod 644 ${{ github.workspace }}/.cache/key
         echo "${{ secrets.AWS_SSH_KEY }}" > ${private_key} && chmod 400 ${private_key}
-        ./tests/ci-run-e2e.sh ${TEST_CASE} ${COMMIT_SHORT_SHA}-${{ matrix.driver }}
+        rc=0
+        for driver_version in ${{ env.DRIVER_VERSIONS }}; do
+          echo "Running e2e for DRIVER_VERSION=$driver_version"
+          # NOTE(review): the second argument used to be ${COMMIT_SHORT_SHA}-<driver version>; confirm the bare version is intended.
+          # The default run shell is `bash -e`: capture the status with `||`, otherwise the first failure ends the step here.
+          status=0
+          ./tests/ci-run-e2e.sh ${TEST_CASE} ${driver_version} || status=$?
+          if [ $status -ne 0 ]; then
+            echo "e2e validation failed for driver version $driver_version with status $status"
+            rc=$status
+          fi
+        done
+        source ./tests/scripts/.definitions.sh
+        ./tests/scripts/pull.sh ${LOG_DIR} logs || echo "warning: could not pull logs from ${LOG_DIR}"
+        exit $rc
+
+    - name: Archive test logs
+      if: ${{ failure() }}
+      uses: actions/upload-artifact@v4
+      with:
+        name: nvidiadriver-e2e-test-logs
+        path: ./logs/
+        retention-days: 15
diff --git a/.github/workflows/image.yaml b/.github/workflows/image.yaml
@@ -93,86 +93,89 @@ jobs:
           IMAGE_NAME: ghcr.io/nvidia/driver
           VERSION: ${COMMIT_SHORT_SHA}
         run: |
-          DRIVER_VERSIONS=${{ matrix.driver }} make build-${{ matrix.dist }}-${{ matrix.driver }}
+          # SHIVA
+          # DRIVER_VERSIONS=${{ matrix.driver }} make build-${{ matrix.dist }}-${{ matrix.driver }}
+          echo "SHIVA compeleted image building"
 
-  pre-compiled:
-    runs-on: ubuntu-latest
-    strategy:
-      matrix:
-        driver: 
-          - 535
-          - 550
-        flavor:
-          - aws
-          - azure
-          - generic
-          - nvidia
-          - oracle
-        ispr:
-          - ${{github.event_name == 'pull_request'}}
-        exclude:
-          - ispr: true
-            flavor: azure
-          - ispr: true
-            flavor: aws
-          - ispr: true
-            flavor: nvidia
-          - ispr: true
-            flavor: oracle
-    steps:
-      - uses: actions/checkout@v4
-        name: Check out code
-      - name: Calculate build vars
-        id: vars
-        run: |
-          echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> $GITHUB_ENV
-          echo "LOWERCASE_REPO_OWNER=$(echo "${GITHUB_REPOSITORY_OWNER}" | awk '{print tolower($0)}')" >> $GITHUB_ENV
-          REPO_FULL_NAME="${{ github.event.pull_request.head.repo.full_name }}"
-          echo "${REPO_FULL_NAME}"
-          echo "LABEL_IMAGE_SOURCE=https://github.com/${REPO_FULL_NAME}" >> $GITHUB_ENV
+# SHIVA
+  # pre-compiled:
+  #   runs-on: ubuntu-latest
+  #   strategy:
+  #     matrix:
+  #       driver: 
+  #         - 535
+  #         - 550
+  #       flavor:
+  #         - aws
+  #         - azure
+  #         - generic
+  #         - nvidia
+  #         - oracle
+  #       ispr:
+  #         - ${{github.event_name == 'pull_request'}}
+  #       exclude:
+  #         - ispr: true
+  #           flavor: azure
+  #         - ispr: true
+  #           flavor: aws
+  #         - ispr: true
+  #           flavor: nvidia
+  #         - ispr: true
+  #           flavor: oracle
+  #   steps:
+  #     - uses: actions/checkout@v4
+  #       name: Check out code
+  #     - name: Calculate build vars
+  #       id: vars
+  #       run: |
+  #         echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> $GITHUB_ENV
+  #         echo "LOWERCASE_REPO_OWNER=$(echo "${GITHUB_REPOSITORY_OWNER}" | awk '{print tolower($0)}')" >> $GITHUB_ENV
+  #         REPO_FULL_NAME="${{ github.event.pull_request.head.repo.full_name }}"
+  #         echo "${REPO_FULL_NAME}"
+  #         echo "LABEL_IMAGE_SOURCE=https://github.com/${REPO_FULL_NAME}" >> $GITHUB_ENV
 
-          GENERATE_ARTIFACTS="false"
-          if [[ "${{ github.actor }}" == "dependabot[bot]" ]]; then
-            GENERATE_ARTIFACTS="false"
-          elif [[ "${{ github.event_name }}" == "pull_request" && "${{ github.event.pull_request.head.repo.full_name }}" == "${{ github.repository }}" ]]; then
-            GENERATE_ARTIFACTS="true"
-          elif [[ "${{ github.event_name }}" == "push" ]]; then
-            GENERATE_ARTIFACTS="true"
-          fi
-          echo "PUSH_ON_BUILD=${GENERATE_ARTIFACTS}" >> $GITHUB_ENV
-          echo "BUILD_MULTI_ARCH_IMAGES=${GENERATE_ARTIFACTS}" >> $GITHUB_ENV
+  #         GENERATE_ARTIFACTS="false"
+  #         if [[ "${{ github.actor }}" == "dependabot[bot]" ]]; then
+  #           GENERATE_ARTIFACTS="false"
+  #         elif [[ "${{ github.event_name }}" == "pull_request" && "${{ github.event.pull_request.head.repo.full_name }}" == "${{ github.repository }}" ]]; then
+  #           GENERATE_ARTIFACTS="true"
+  #         elif [[ "${{ github.event_name }}" == "push" ]]; then
+  #           GENERATE_ARTIFACTS="true"
+  #         fi
+  #         echo "PUSH_ON_BUILD=${GENERATE_ARTIFACTS}" >> $GITHUB_ENV
+  #         echo "BUILD_MULTI_ARCH_IMAGES=${GENERATE_ARTIFACTS}" >> $GITHUB_ENV
 
-      - name: Set up QEMU
-        uses: docker/setup-qemu-action@v3
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
-      - name: Login to GitHub Container Registry
-        uses: docker/login-action@v3
-        with:
-          registry: ghcr.io
-          username: ${{ github.actor }}
-          password: ${{ secrets.GITHUB_TOKEN }}
-      - name: Build base image and get kernel version
-        env:
-          IMAGE_NAME: ghcr.io/nvidia/driver
-          VERSION: ${COMMIT_SHORT_SHA}
-          BASE_TARGET: jammy
-        run: |
-          make DRIVER_BRANCH=${{ matrix.driver }} KERNEL_FLAVOR=${{ matrix.flavor }} build-base-${BASE_TARGET}
+  #     - name: Set up QEMU
+  #       uses: docker/setup-qemu-action@v3
+  #     - name: Set up Docker Buildx
+  #       uses: docker/setup-buildx-action@v3
+  #     - name: Login to GitHub Container Registry
+  #       uses: docker/login-action@v3
+  #       with:
+  #         registry: ghcr.io
+  #         username: ${{ github.actor }}
+  #         password: ${{ secrets.GITHUB_TOKEN }}
+  #     - name: Build base image and get kernel version
+  #       env:
+  #         IMAGE_NAME: ghcr.io/nvidia/driver
+  #         VERSION: ${COMMIT_SHORT_SHA}
+  #         BASE_TARGET: jammy
+  #       run: |
+  #         make DRIVER_BRANCH=${{ matrix.driver }} KERNEL_FLAVOR=${{ matrix.flavor }} build-base-${BASE_TARGET}
 
-          trap "docker rm -f base-${BASE_TARGET}-${{ matrix.flavor }}" EXIT
-          docker run -d --name base-${BASE_TARGET}-${{ matrix.flavor }} ghcr.io/nvidia/driver:base-${BASE_TARGET}-${{ matrix.flavor }}-${{ matrix.driver }} 
-          # try 3 times every 10 seconds to get the file, if success exit the loop
-          for i in {1..3}; do
-              docker cp base-${BASE_TARGET}-${{ matrix.flavor }}:/var/kernel_version.txt kernel_version.txt && break
-              sleep 10
-          done
-      - name: Build image
-        env:
-          IMAGE_NAME: ghcr.io/nvidia/driver
-          VERSION: ${COMMIT_SHORT_SHA}
-          PRECOMPILED: "true"
-          DIST: signed_ubuntu22.04
-        run: |
-          source kernel_version.txt && \
-          make DRIVER_VERSIONS=${DRIVER_VERSIONS} DRIVER_BRANCH=${{ matrix.driver }} build-${DIST}-${DRIVER_VERSION}
+  #         trap "docker rm -f base-${BASE_TARGET}-${{ matrix.flavor }}" EXIT
+  #         docker run -d --name base-${BASE_TARGET}-${{ matrix.flavor }} ghcr.io/nvidia/driver:base-${BASE_TARGET}-${{ matrix.flavor }}-${{ matrix.driver }} 
+  #         # try 3 times every 10 seconds to get the file, if success exit the loop
+  #         for i in {1..3}; do
+  #             docker cp base-${BASE_TARGET}-${{ matrix.flavor }}:/var/kernel_version.txt kernel_version.txt && break
+  #             sleep 10
+  #         done
+  #     - name: Build image
+  #       env:
+  #         IMAGE_NAME: ghcr.io/nvidia/driver
+  #         VERSION: ${COMMIT_SHORT_SHA}
+  #         PRECOMPILED: "true"
+  #         DIST: signed_ubuntu22.04
+  #       run: |
+  #         source kernel_version.txt && \
+  #         make DRIVER_VERSIONS=${DRIVER_VERSIONS} DRIVER_BRANCH=${{ matrix.driver }} build-${DIST}-${DRIVER_VERSION}
diff --git a/tests/scripts/.definitions.sh b/tests/scripts/.definitions.sh
@@ -19,3 +19,9 @@ CASES_DIR="$( cd "${TEST_DIR}/cases" && pwd )"
 : ${HELM_NVIDIA_REPO:="https://helm.ngc.nvidia.com/nvidia"}
 
 : ${TARGET_DRIVER_VERSION:="550.90.07"}
+
+: ${DAEMON_POD_STATUS_TIME_OUT:="10m"} # passed to check_pod_ready for the nvidia-driver-daemonset pod
+: ${POD_STATUS_TIME_OUT:="2m"} # timeout for `kubectl wait` in check_pod_ready
+: ${MAX_POD_STATUS_CHECK_TOTAL_TIME:="3000"} # seconds (50 minutes); overall cap on the readiness loop in verify-operator.sh
+
+: ${LOG_DIR:="/tmp/logs"} # directory where verify-operator.sh writes pod descriptions and logs
diff --git a/tests/scripts/checks.sh b/tests/scripts/checks.sh
@@ -2,35 +2,25 @@
 
+# Wait until the pods labelled app=<pod_label> in ${TEST_NAMESPACE} report Ready.
+#   $1 - value of the "app" label to select pods by
+#   $2 - timeout passed to `kubectl wait` (e.g. 2m); defaults to ${POD_STATUS_TIME_OUT}
+# Returns the exit status of `kubectl wait` so callers can chain checks with `&&`.
 check_pod_ready() {
 	local pod_label=$1
+	local pod_status_time_out=${2:-${POD_STATUS_TIME_OUT}}
+	local rc=0
 	local current_time=0
-	while :; do
-		echo "Checking $pod_label pod"
-		kubectl get pods -lapp=$pod_label -n ${TEST_NAMESPACE}
+
+	echo "Checking $pod_label pod"
+
+	kubectl get pods -lapp=$pod_label -n ${TEST_NAMESPACE}
 
-		echo "Checking $pod_label pod readiness"
-		is_pod_ready=$(kubectl get pods -lapp=$pod_label -n ${TEST_NAMESPACE} -ojsonpath='{range .items[*]}{.status.conditions[?(@.type=="Ready")].status}{"\n"}{end}' 2>/dev/null || echo "terminated")
+	echo "Checking $pod_label pod readiness"
 
-		if [ "${is_pod_ready}" = "True" ]; then
-			# Check if the pod is not in terminating state
-			is_pod_terminating=$(kubectl get pods -lapp=$pod_label -n ${TEST_NAMESPACE} -o jsonpath='{.items[0].metadata.deletionGracePeriodSeconds}' 2>/dev/null || echo "terminated")
-			if [ "${is_pod_terminating}" != "" ]; then
-				echo "pod $pod_label is in terminating state..."
-			else
-				echo "Pod $pod_label is ready"
-				break;
-			fi
-		fi
+	# Record a failed or timed-out wait instead of dropping it; the status is returned below.
+	kubectl wait -n ${TEST_NAMESPACE} --for=condition=Ready pod -l app=$pod_label --timeout=${pod_status_time_out} || rc=$?
 
-		if [[ "${current_time}" -gt $((60 * 45)) ]]; then
-			echo "timeout reached"
-			exit 1;
-		fi
-
-		# Echo useful information on stdout
-		kubectl get pods -n ${TEST_NAMESPACE}
-
-		echo "Sleeping 5 seconds"
-		current_time=$((${current_time} + 5))
-		sleep 5
-	done
+	# print status of pod
+	kubectl get pods -n ${TEST_NAMESPACE}
+
+	return $rc
 }
diff --git a/tests/scripts/end-to-end-nvidia-driver.sh b/tests/scripts/end-to-end-nvidia-driver.sh
@@ -7,7 +7,6 @@ echo ""
 echo ""
 echo "--------------Installing the GPU Operator--------------"
 
-# Install the operator with usePrecompiled mode set to true
 ${SCRIPT_DIR}/install-operator.sh
 
 "${SCRIPT_DIR}"/verify-operator.sh

diff --git a/tests/scripts/pull.sh b/tests/scripts/pull.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+
+# Usage: pull.sh <source> <destination>
+# Hands "<instance_hostname>:<source>" and <destination> to sync.sh.
+# NOTE(review): instance_hostname is presumably provided by .local.sh - confirm.
+
+(( $# == 2 )) || {
+    echo "Pull requires a source and destination"
+    exit 1
+}
+
+# Directory holding this script, used to locate its sibling scripts.
+SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+. ${SCRIPT_DIR}/.definitions.sh
+. ${SCRIPT_DIR}/.local.sh
+
+${SCRIPT_DIR}/sync.sh ${instance_hostname}:${1} ${2}
diff --git a/tests/scripts/verify-operator.sh b/tests/scripts/verify-operator.sh
@@ -11,9 +11,51 @@ source ${SCRIPT_DIR}/.definitions.sh
 # Import the check definitions
 source ${SCRIPT_DIR}/checks.sh
 
-check_pod_ready "nvidia-driver-daemonset"
-check_pod_ready "nvidia-container-toolkit-daemonset"
-check_pod_ready "nvidia-device-plugin-daemonset"
-check_pod_ready "nvidia-dcgm-exporter"
-check_pod_ready "gpu-feature-discovery"
-check_pod_ready "nvidia-operator-validator"
+# Wait for the nvidia-driver pod to be ready.
+# If successful, then wait for the validator pod to be ready (this means that the rest of the pods are healthy).
+# While pods are still unhealthy, save their description and logs under ${LOG_DIR}.
+# The whole wait is capped at ${MAX_POD_STATUS_CHECK_TOTAL_TIME} seconds.
+
+# Plain assignments: `local` is only valid inside a function and fails at script top level.
+start_time=$(date +%s)
+log_dir=${LOG_DIR}
+# The log files below are appended to, so the directory has to exist first.
+mkdir -p "${log_dir}"
+
+while :; do
+    current_time=$(date +%s)
+    elapsed_time=$((current_time - start_time))
+
+    # Give up once the total wait budget is spent
+    if [ $elapsed_time -gt $MAX_POD_STATUS_CHECK_TOTAL_TIME ]; then
+        echo "Total wait time exceeded ${MAX_POD_STATUS_CHECK_TOTAL_TIME} seconds. Exiting..."
+        kubectl delete pods --all -n ${TEST_NAMESPACE}
+        exit 1
+    fi
+
+    # Remember whether both waits passed; a failed wait must not skip log collection.
+    pods_ready=true
+    check_pod_ready "nvidia-driver-daemonset" ${DAEMON_POD_STATUS_TIME_OUT} && \
+    check_pod_ready "nvidia-operator-validator" ${POD_STATUS_TIME_OUT} || \
+    pods_ready=false
+
+    # Succeeded pods have finished their work; only the other non-Running phases count as unhealthy.
+    not_ready_pod_status=$(kubectl get pods -n ${TEST_NAMESPACE} --field-selector=status.phase!=Running,status.phase!=Succeeded -o jsonpath='{.items[*].metadata.name}')
+    if [ -n "$not_ready_pod_status" ]; then
+        for pod in $not_ready_pod_status; do
+            echo "Collecting logs for pod: $pod"
+            echo "------------------------------------------------" >> "${log_dir}/${pod}.describe"
+            kubectl -n "${TEST_NAMESPACE}" describe pods "${pod}" >> "${log_dir}/${pod}.describe" || true
+            kubectl logs $pod -n ${TEST_NAMESPACE} --all-containers=true >> "${log_dir}/${pod}_logs.txt" || true
+            echo "Logs saved to ${pod}_logs.txt"
+        done
+    elif [ "${pods_ready}" = "true" ]; then
+        echo "All gpu-operator pods are ready."
+        kubectl delete pods --all -n ${TEST_NAMESPACE}
+        break;
+    else
+        echo "Pods are running but not Ready yet; checking again."
+    fi
+
+    sleep 10
+done