end-to-end gpu driver testing enhancement #88

Merged: 1 commit, Aug 20, 2024
31 changes: 23 additions & 8 deletions .github/workflows/ci.yaml
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

name: CI
name: End-to-end tests

on:
workflow_run:
@@ -25,11 +25,6 @@ on:
jobs:
e2e-tests-nvidiadriver:
runs-on: ubuntu-latest
strategy:
matrix:
driver:
- 535.183.06
- 550.90.07

steps:
- name: Check out code
@@ -41,7 +36,6 @@ jobs:
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AWS_SSH_KEY: ${{ secrets.AWS_SSH_KEY }}
AWS_SESSION_TOKEN: ${{ secrets.AWS_SESSION_TOKEN }}
with:
aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
@@ -59,11 +53,32 @@ jobs:
echo "instance_hostname=ubuntu@${{ steps.get_public_dns_name.outputs.result }}" >> $GITHUB_ENV
echo "private_key=${{ github.workspace }}/key.pem" >> $GITHUB_ENV
echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> $GITHUB_ENV
DRIVER_VERSIONS=$(grep '^DRIVER_VERSIONS ?=' versions.mk | awk -F' ?= ' '{print $2}')
echo "DRIVER_VERSIONS=$DRIVER_VERSIONS" >> $GITHUB_ENV
- name: Validate gpu driver
env:
TEST_CASE: "./tests/cases/nvidia-driver.sh"
run: |
sudo chmod 644 ${{ github.workspace }}/.cache/key
echo "${{ secrets.AWS_SSH_KEY }}" > ${private_key} && chmod 400 ${private_key}
./tests/ci-run-e2e.sh ${TEST_CASE} ${COMMIT_SHORT_SHA}-${{ matrix.driver }}
rc=0
for driver_version in ${DRIVER_VERSIONS}; do
echo "Running e2e for DRIVER_VERSION=$driver_version"
status=0
./tests/ci-run-e2e.sh ${TEST_CASE} ${COMMIT_SHORT_SHA}-${driver_version} || status=$?
if [ $status -ne 0 ]; then
echo "e2e validation failed for driver version $driver_version with status $status"
rc=$status
fi
done
source ./tests/scripts/.definitions.sh
./tests/scripts/pull.sh ${LOG_DIR} logs
exit $rc
- name: Archive test logs
if: ${{ failure() }}
uses: actions/upload-artifact@v4
with:
name: nvidiadriver-e2e-test-logs
path: ./logs/
retention-days: 15
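A note on the reworked workflow step above: the driver list is no longer a hard-coded job matrix but is read from versions.mk at run time, and every version is exercised in a single job run with the worst exit code propagated at the end. A minimal sketch of how the extraction and loop behave, assuming versions.mk declares the same two versions the former job matrix listed (the file itself is not part of this diff):

    # versions.mk (assumed contents):
    #   DRIVER_VERSIONS ?= 535.183.06 550.90.07

    DRIVER_VERSIONS=$(grep '^DRIVER_VERSIONS ?=' versions.mk | awk -F' ?= ' '{print $2}')
    echo "${DRIVER_VERSIONS}"        # -> 535.183.06 550.90.07

    rc=0
    for driver_version in ${DRIVER_VERSIONS}; do
        status=0
        ./tests/ci-run-e2e.sh ./tests/cases/nvidia-driver.sh "${COMMIT_SHORT_SHA}-${driver_version}" || status=$?
        if [ ${status} -ne 0 ]; then
            rc=${status}             # remember the failure but keep testing the remaining versions
        fi
    done
    exit ${rc}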
5 changes: 5 additions & 0 deletions tests/scripts/.definitions.sh
@@ -19,3 +19,8 @@ CASES_DIR="$( cd "${TEST_DIR}/cases" && pwd )"
: ${HELM_NVIDIA_REPO:="https://helm.ngc.nvidia.com/nvidia"}

: ${TARGET_DRIVER_VERSION:="550.90.07"}

: ${DAEMON_POD_STATUS_TIME_OUT:="15m"}
: ${POD_STATUS_TIME_OUT:="2m"}

: ${LOG_DIR:="/tmp/logs"}
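The additions to .definitions.sh use the `: ${VAR:="default"}` idiom, so each value is only a default and can be overridden from the environment of whatever script sources the file. A small usage sketch (values here are purely illustrative):

    # Defaults apply when nothing is exported:
    source tests/scripts/.definitions.sh
    echo "${DAEMON_POD_STATUS_TIME_OUT}"   # 15m
    echo "${LOG_DIR}"                      # /tmp/logs

    # An exported value wins over the default:
    export POD_STATUS_TIME_OUT=5m
    source tests/scripts/.definitions.sh
    echo "${POD_STATUS_TIME_OUT}"          # 5m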
39 changes: 12 additions & 27 deletions tests/scripts/checks.sh
@@ -2,35 +2,20 @@

check_pod_ready() {
local pod_label=$1
local current_time=0
while :; do
echo "Checking $pod_label pod"
kubectl get pods -lapp=$pod_label -n ${TEST_NAMESPACE}
local pod_status_time_out=$2

echo "Checking $pod_label pod"

kubectl get pods -lapp=$pod_label -n ${TEST_NAMESPACE}

echo "Checking $pod_label pod readiness"
is_pod_ready=$(kubectl get pods -lapp=$pod_label -n ${TEST_NAMESPACE} -ojsonpath='{range .items[*]}{.status.conditions[?(@.type=="Ready")].status}{"\n"}{end}' 2>/dev/null || echo "terminated")
echo "Checking $pod_label pod readiness"

if [ "${is_pod_ready}" = "True" ]; then
# Check if the pod is not in terminating state
is_pod_terminating=$(kubectl get pods -lapp=$pod_label -n ${TEST_NAMESPACE} -o jsonpath='{.items[0].metadata.deletionGracePeriodSeconds}' 2>/dev/null || echo "terminated")
if [ "${is_pod_terminating}" != "" ]; then
echo "pod $pod_label is in terminating state..."
else
echo "Pod $pod_label is ready"
break;
fi
fi

if [[ "${current_time}" -gt $((60 * 45)) ]]; then
echo "timeout reached"
exit 1;
fi

# Echo useful information on stdout
if kubectl wait -n ${TEST_NAMESPACE} --for=condition=Ready pod -l app=$pod_label --timeout ${pod_status_time_out}; then
return 0
else
# print status of pod
kubectl get pods -n ${TEST_NAMESPACE}
fi

echo "Sleeping 5 seconds"
current_time=$((${current_time} + 5))
sleep 5
done
return 1
}
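The rewritten check_pod_ready drops the hand-rolled poll loop (5-second sleeps up to a 45-minute ceiling) in favour of a single `kubectl wait --for=condition=Ready` call whose timeout is supplied by the caller. A usage sketch, assuming .definitions.sh has been sourced so TEST_NAMESPACE and the timeout defaults are in place; the longer timeout for the driver daemonset reflects the time the driver pod can take to build and load kernel modules:

    source tests/scripts/.definitions.sh
    source tests/scripts/checks.sh

    # Gate on the driver pod first; it is the slowest component to become Ready.
    check_pod_ready "nvidia-driver-daemonset" "${DAEMON_POD_STATUS_TIME_OUT}" || exit 1

    # Everything else should follow quickly once the driver is up.
    check_pod_ready "nvidia-operator-validator" "${POD_STATUS_TIME_OUT}" || exit 1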
6 changes: 5 additions & 1 deletion tests/scripts/end-to-end-nvidia-driver.sh
@@ -7,8 +7,12 @@ echo ""
echo ""
echo "--------------Installing the GPU Operator--------------"

# Install the operator with usePrecompiled mode set to true
${SCRIPT_DIR}/install-operator.sh

"${SCRIPT_DIR}"/verify-operator.sh

echo "--------------Verification completed for GPU Operator, uninstalling the operator--------------"

${SCRIPT_DIR}/uninstall-operator.sh ${TEST_NAMESPACE} "gpu-operator"

echo "--------------Verification completed for GPU Operator--------------"
12 changes: 12 additions & 0 deletions tests/scripts/pull.sh
@@ -0,0 +1,12 @@
#!/bin/bash

if [[ $# -ne 2 ]]; then
echo "Pull requires a source and destination"
exit 1
fi

SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
source ${SCRIPT_DIR}/.definitions.sh
source ${SCRIPT_DIR}/.local.sh

${SCRIPT_DIR}/sync.sh ${instance_hostname}:${1} ${2}
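pull.sh is the helper the workflow uses to fetch test artifacts off the remote instance before uploading them: it picks up the instance hostname from .local.sh and hands the copy off to sync.sh. sync.sh itself is not part of this diff, so treat it here as an assumed scp/rsync-style wrapper. Usage as seen in the workflow above:

    source ./tests/scripts/.definitions.sh        # provides LOG_DIR (default /tmp/logs)
    ./tests/scripts/pull.sh "${LOG_DIR}" logs     # copies ${instance_hostname}:/tmp/logs -> ./logs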
3 changes: 2 additions & 1 deletion tests/scripts/remote.sh
@@ -4,4 +4,5 @@ SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
source ${SCRIPT_DIR}/.definitions.sh
source ${SCRIPT_DIR}/.local.sh

ssh -i ${private_key} ${instance_hostname} "${@}"
# Send a keepalive probe every 60s; drop the connection only after 30 consecutive unanswered probes (~30 minutes)
ssh -o ServerAliveInterval=60 -o ServerAliveCountMax=30 -i ${private_key} ${instance_hostname} "${@}"
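The keepalive options matter because a single remote e2e run (driver build, operator install, validation) can leave the SSH session output-silent for long stretches, and an idle NAT or firewall may otherwise drop the connection and fail the job. A sketch of what the settings amount to (the remote command is only an illustration):

    # Probe the server every 60 seconds; tolerate up to 30 unanswered probes,
    # i.e. roughly 30 minutes of silence, before the client gives up.
    ssh -o ServerAliveInterval=60 -o ServerAliveCountMax=30 \
        -i "${private_key}" "${instance_hostname}" "sleep 1500 && echo still-alive"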
14 changes: 14 additions & 0 deletions tests/scripts/uninstall-operator.sh
@@ -0,0 +1,14 @@
#!/bin/bash

if [[ "${SKIP_INSTALL}" == "true" ]]; then
echo "Skipping install: SKIP_INSTALL=${SKIP_INSTALL}"
exit 0
fi

SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
source ${SCRIPT_DIR}/.definitions.sh

namespace=$1
release_name=$2
helm uninstall $release_name --namespace $namespace || true
kubectl delete namespace $namespace || true
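uninstall-operator.sh is written to be safe to call even when there is nothing to clean up: both the helm uninstall and the namespace deletion are followed by `|| true`, so a missing release or namespace does not fail the calling script, and SKIP_INSTALL short-circuits the whole thing. Typical invocations (release name and namespace as used elsewhere in this PR):

    # Normal teardown, as called from verify-operator.sh on failure:
    ./tests/scripts/uninstall-operator.sh "${TEST_NAMESPACE}" "gpu-operator"

    # Leave an existing deployment in place, e.g. while iterating locally:
    SKIP_INSTALL=true ./tests/scripts/uninstall-operator.sh "${TEST_NAMESPACE}" "gpu-operator"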
20 changes: 14 additions & 6 deletions tests/scripts/verify-operator.sh
@@ -11,9 +11,17 @@ source ${SCRIPT_DIR}/.definitions.sh
# Import the check definitions
source ${SCRIPT_DIR}/checks.sh

check_pod_ready "nvidia-driver-daemonset"
check_pod_ready "nvidia-container-toolkit-daemonset"
check_pod_ready "nvidia-device-plugin-daemonset"
check_pod_ready "nvidia-dcgm-exporter"
check_pod_ready "gpu-feature-discovery"
check_pod_ready "nvidia-operator-validator"
# Wait for the nvidia-driver daemonset pod to become Ready.
# If that succeeds, wait for the validator pod; a Ready validator implies the remaining operand pods are healthy.
# On failure, collect logs before uninstalling the operator.
check_pod_ready "nvidia-driver-daemonset" ${DAEMON_POD_STATUS_TIME_OUT} && \
check_pod_ready "nvidia-operator-validator" ${POD_STATUS_TIME_OUT}; exit_status=$?
if [ $exit_status -ne 0 ]; then
curl -o ${SCRIPT_DIR}/must-gather.sh "https://raw.githubusercontent.com/NVIDIA/gpu-operator/main/hack/must-gather.sh"
chmod +x ${SCRIPT_DIR}/must-gather.sh
ARTIFACT_DIR="${LOG_DIR}" ${SCRIPT_DIR}/must-gather.sh
${SCRIPT_DIR}/uninstall-operator.sh ${TEST_NAMESPACE} "gpu-operator"
exit 1
else
echo "All gpu-operator pods are ready."
fi
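One subtlety in the readiness gate above: `check_pod_ready ... && check_pod_ready ...; exit_status=$?` captures the exit status of the whole chain, so a failure of either check (the driver check short-circuits the second) routes the script into the must-gather branch. An equivalent long-form sketch of the same control flow; it only normalises the particular failure code to 1, which makes no difference to the `-ne 0` test:

    if check_pod_ready "nvidia-driver-daemonset" "${DAEMON_POD_STATUS_TIME_OUT}" && \
       check_pod_ready "nvidia-operator-validator" "${POD_STATUS_TIME_OUT}"; then
        exit_status=0
    else
        exit_status=1   # either the driver or the validator never became Ready
    fi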