End-to-end GPU driver testing enhancement
Signed-off-by: shiva kumar <[email protected]>
shivakunv committed Aug 17, 2024
1 parent 600b7bf commit 13839ac
Showing 9 changed files with 346 additions and 44 deletions.
33 changes: 24 additions & 9 deletions .github/workflows/ci.yaml
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

name: CI
name: End-to-end tests

on:
workflow_run:
@@ -21,15 +21,10 @@ on:
- completed
branches:
- main

jobs:
e2e-tests-nvidiadriver:
runs-on: ubuntu-latest
strategy:
matrix:
driver:
- 535.183.06
- 550.90.07

steps:
- name: Check out code
@@ -41,7 +36,6 @@ jobs:
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AWS_SSH_KEY: ${{ secrets.AWS_SSH_KEY }}
AWS_SESSION_TOKEN: ${{ secrets.AWS_SESSION_TOKEN }}
with:
aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
@@ -59,11 +53,32 @@ jobs:
echo "instance_hostname=ubuntu@${{ steps.get_public_dns_name.outputs.result }}" >> $GITHUB_ENV
echo "private_key=${{ github.workspace }}/key.pem" >> $GITHUB_ENV
echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> $GITHUB_ENV
DRIVER_VERSIONS=$(grep '^DRIVER_VERSIONS ?=' versions.mk | awk -F' ?= ' '{print $2}')
echo "DRIVER_VERSIONS=$DRIVER_VERSIONS" >> $GITHUB_ENV
- name: Validate gpu driver
env:
TEST_CASE: "./tests/cases/nvidia-driver.sh"
run: |
sudo chmod 644 ${{ github.workspace }}/.cache/key
echo "${{ secrets.AWS_SSH_KEY }}" > ${private_key} && chmod 400 ${private_key}
./tests/ci-run-e2e.sh ${TEST_CASE} ${COMMIT_SHORT_SHA}-${{ matrix.driver }}
rc=0
for driver_version in ${DRIVER_VERSIONS}; do
echo "Running e2e for DRIVER_VERSION=$driver_version"
status=0
./tests/ci-run-e2e.sh ${TEST_CASE} ${COMMIT_SHORT_SHA}-${driver_version} || status=$?
if [ $status -ne 0 ]; then
echo "e2e validation failed for driver version $driver_version with status $status"
rc=$status
fi
done
source ./tests/scripts/.definitions.sh
./tests/scripts/pull.sh ${LOG_DIR} logs
exit $rc
- name: Archive test logs
if: ${{ failure() }}
uses: actions/upload-artifact@v4
with:
name: nvidiadriver-e2e-test-logs
path: ./logs/
retention-days: 15
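
Note on the workflow change: the job no longer hardcodes a driver matrix; it derives the versions to test from versions.mk at runtime. A minimal sketch of how that extraction behaves, assuming versions.mk declares the versions on a single DRIVER_VERSIONS line (versions.mk itself is not part of this diff, so its exact contents are an assumption):

    # Hypothetical versions.mk line (illustrative only):
    #   DRIVER_VERSIONS ?= 535.183.06 550.90.07
    DRIVER_VERSIONS=$(grep '^DRIVER_VERSIONS ?=' versions.mk | awk -F' ?= ' '{print $2}')
    echo "$DRIVER_VERSIONS"
    # -> 535.183.06 550.90.07
    for driver_version in ${DRIVER_VERSIONS}; do
        echo "would run e2e for ${driver_version}"
    done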
5 changes: 5 additions & 0 deletions tests/scripts/.definitions.sh
@@ -19,3 +19,8 @@ CASES_DIR="$( cd "${TEST_DIR}/cases" && pwd )"
: ${HELM_NVIDIA_REPO:="https://helm.ngc.nvidia.com/nvidia"}

: ${TARGET_DRIVER_VERSION:="550.90.07"}

: ${DAEMON_POD_STATUS_TIME_OUT:="15m"}
: ${POD_STATUS_TIME_OUT:="2m"}

: ${LOG_DIR:="/tmp/logs"}
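
Because these defaults use the ": ${VAR:=value}" form, they only take effect when the variable is not already set, so a CI job or developer can override them from the environment. A small usage sketch; the values are illustrative, and it assumes the test case script sources .definitions.sh:

    # Override the new timeouts and log directory for a single run
    export DAEMON_POD_STATUS_TIME_OUT="30m"
    export POD_STATUS_TIME_OUT="5m"
    export LOG_DIR="/tmp/e2e-logs"
    ./tests/cases/nvidia-driver.sh   # assumed to source tests/scripts/.definitions.sh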
39 changes: 12 additions & 27 deletions tests/scripts/checks.sh
@@ -2,35 +2,20 @@

check_pod_ready() {
local pod_label=$1
local current_time=0
while :; do
echo "Checking $pod_label pod"
kubectl get pods -lapp=$pod_label -n ${TEST_NAMESPACE}
local pod_status_time_out=$2

echo "Checking $pod_label pod"

kubectl get pods -lapp=$pod_label -n ${TEST_NAMESPACE}

echo "Checking $pod_label pod readiness"
is_pod_ready=$(kubectl get pods -lapp=$pod_label -n ${TEST_NAMESPACE} -ojsonpath='{range .items[*]}{.status.conditions[?(@.type=="Ready")].status}{"\n"}{end}' 2>/dev/null || echo "terminated")
echo "Checking $pod_label pod readiness"

if [ "${is_pod_ready}" = "True" ]; then
# Check if the pod is not in terminating state
is_pod_terminating=$(kubectl get pods -lapp=$pod_label -n ${TEST_NAMESPACE} -o jsonpath='{.items[0].metadata.deletionGracePeriodSeconds}' 2>/dev/null || echo "terminated")
if [ "${is_pod_terminating}" != "" ]; then
echo "pod $pod_label is in terminating state..."
else
echo "Pod $pod_label is ready"
break;
fi
fi

if [[ "${current_time}" -gt $((60 * 45)) ]]; then
echo "timeout reached"
exit 1;
fi

# Echo useful information on stdout
if kubectl wait -n ${TEST_NAMESPACE} --for=condition=Ready pod -l app=$pod_label --timeout ${pod_status_time_out}; then
return 0
else
# print status of pod
kubectl get pods -n ${TEST_NAMESPACE}
fi

echo "Sleeping 5 seconds"
current_time=$((${current_time} + 5))
sleep 5
done
return 1
}
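
With this rewrite, check_pod_ready delegates the polling to kubectl wait and takes the timeout as its second argument, so each caller chooses how long to wait per component. A hypothetical invocation, assuming checks.sh and .definitions.sh are sourced together, that TEST_NAMESPACE is defined in .definitions.sh, and that the pod labels shown here are the ones used by the operands:

    source ./tests/scripts/.definitions.sh
    source ./tests/scripts/checks.sh

    # Driver daemonset pods can take a while to build and load the kernel modules
    check_pod_ready "nvidia-driver-daemonset" "${DAEMON_POD_STATUS_TIME_OUT}" || exit 1

    # Other operand pods are expected to become Ready much faster
    check_pod_ready "nvidia-device-plugin-daemonset" "${POD_STATUS_TIME_OUT}" || exit 1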
6 changes: 5 additions & 1 deletion tests/scripts/end-to-end-nvidia-driver.sh
@@ -7,8 +7,12 @@ echo ""
echo ""
echo "--------------Installing the GPU Operator--------------"

# Install the operator with usePrecompiled mode set to true
${SCRIPT_DIR}/install-operator.sh

"${SCRIPT_DIR}"/verify-operator.sh

echo "--------------Verification completed for GPU Operator, uninstalling the operator--------------"

${SCRIPT_DIR}/uninstall-operator.sh ${TEST_NAMESPACE} "gpu-operator"

echo "--------------Verification completed for GPU Operator--------------"
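
The end-to-end script now uninstalls the operator after verification, so back-to-back runs against different driver versions start from a clean release. A hedged usage sketch; TEST_NAMESPACE and TARGET_DRIVER_VERSION appear in the scripts above, but the values shown here are assumptions:

    TEST_NAMESPACE="gpu-operator" \
    TARGET_DRIVER_VERSION="550.90.07" \
      ./tests/scripts/end-to-end-nvidia-driver.sh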
256 changes: 256 additions & 0 deletions tests/scripts/must-gather.sh
@@ -0,0 +1,256 @@
#!/usr/bin/env bash

set -o nounset
set -x

K=kubectl
if ! $K version > /dev/null; then
K=oc

if ! $K version > /dev/null; then
echo "FATAL: neither 'kubectl' nor 'oc' appear to be working properly. Exiting ..."
exit 1
fi
fi

if [[ "$0" == "/usr/bin/gather" ]]; then
echo "Running as must-gather plugin image"
export ARTIFACT_DIR=/must-gather
else
if [ -z "${ARTIFACT_DIR:-}" ]; then
export ARTIFACT_DIR="/tmp/nvidia-gpu-operator_$(date +%Y%m%d_%H%M)"
fi
echo "Using ARTIFACT_DIR=$ARTIFACT_DIR"
fi

mkdir -p "$ARTIFACT_DIR"

echo

exec 1> >(tee $ARTIFACT_DIR/must-gather.log)
exec 2> $ARTIFACT_DIR/must-gather.stderr.log

if [[ "$0" == "/usr/bin/gather" ]]; then
echo "NVIDIA GPU Operator" > $ARTIFACT_DIR/version
echo "${VERSION:-N/A}" >> $ARTIFACT_DIR/version
fi

ocp_cluster=$($K get clusterversion/version --ignore-not-found -oname || true)

if [[ "$ocp_cluster" ]]; then
echo "Running in OpenShift."
echo "Get the cluster version"
$K get clusterversion/version -oyaml > $ARTIFACT_DIR/openshift_version.yaml
fi

echo "Get the operator namespaces"
OPERATOR_POD_NAME=$($K get pods -lapp=gpu-operator -oname -A)

if [ -z "$OPERATOR_POD_NAME" ]; then
echo "FATAL: could not find the GPU Operator Pod ..."
exit 1
fi

OPERATOR_NAMESPACE=$($K get pods -lapp=gpu-operator -A -ojsonpath={.items[].metadata.namespace} --ignore-not-found)

echo "Using '$OPERATOR_NAMESPACE' as operator namespace"
echo ""

echo "#"
echo "# ClusterPolicy"
echo "#"
echo

CLUSTER_POLICY_NAME=$($K get clusterpolicy -oname)

if [[ "$CLUSTER_POLICY_NAME" ]]; then
echo "Get $CLUSTER_POLICY_NAME"
$K get -oyaml $CLUSTER_POLICY_NAME > $ARTIFACT_DIR/cluster_policy.yaml
else
echo "Mark the ClusterPolicy as missing"
touch $ARTIFACT_DIR/cluster_policy.missing
fi

echo
echo "#"
echo "# Nodes and machines"
echo "#"
echo

if [ "$ocp_cluster" ]; then
echo "Get all the machines"
$K get machines -A > $ARTIFACT_DIR/all_machines.list
fi

echo "Get the labels of the nodes with NVIDIA PCI cards"

GPU_PCI_LABELS=(feature.node.kubernetes.io/pci-10de.present feature.node.kubernetes.io/pci-0302_10de.present feature.node.kubernetes.io/pci-0300_10de.present)

gpu_pci_nodes=""
for label in ${GPU_PCI_LABELS[@]}; do
gpu_pci_nodes="$gpu_pci_nodes $($K get nodes -l$label -oname)"
done

if [ -z "$gpu_pci_nodes" ]; then
echo "FATAL: could not find nodes with NVIDIA PCI labels"
exit 0
fi

for node in $(echo "$gpu_pci_nodes"); do
echo "$node" | cut -d/ -f2 >> $ARTIFACT_DIR/gpu_nodes.labels
$K get $node '-ojsonpath={.metadata.labels}' \
| sed 's|,|,- |g' \
| tr ',' '\n' \
| sed 's/{"/- /' \
| tr : = \
| sed 's/"//g' \
| sed 's/}/\n/' \
>> $ARTIFACT_DIR/gpu_nodes.labels
echo "" >> $ARTIFACT_DIR/gpu_nodes.labels
done

echo "Get the GPU nodes (status)"
$K get nodes -l nvidia.com/gpu.present=true -o wide > $ARTIFACT_DIR/gpu_nodes.status

echo "Get the GPU nodes (description)"
$K describe nodes -l nvidia.com/gpu.present=true > $ARTIFACT_DIR/gpu_nodes.descr

echo ""
echo "#"
echo "# Operator Pod"
echo "#"
echo

echo "Get the GPU Operator Pod (status)"
$K get $OPERATOR_POD_NAME \
-owide \
-n $OPERATOR_NAMESPACE \
> $ARTIFACT_DIR/gpu_operator_pod.status

echo "Get the GPU Operator Pod (yaml)"
$K get $OPERATOR_POD_NAME \
-oyaml \
-n $OPERATOR_NAMESPACE \
> $ARTIFACT_DIR/gpu_operator_pod.yaml

echo "Get the GPU Operator Pod logs"
$K logs $OPERATOR_POD_NAME \
-n $OPERATOR_NAMESPACE \
> "$ARTIFACT_DIR/gpu_operator_pod.log"

$K logs $OPERATOR_POD_NAME \
-n $OPERATOR_NAMESPACE \
--previous \
> "$ARTIFACT_DIR/gpu_operator_pod.previous.log"

echo ""
echo "#"
echo "# Operand Pods"
echo "#"
echo ""

echo "Get the Pods in $OPERATOR_NAMESPACE (status)"
$K get pods -owide \
-n $OPERATOR_NAMESPACE \
> $ARTIFACT_DIR/gpu_operand_pods.status

echo "Get the Pods in $OPERATOR_NAMESPACE (yaml)"
$K get pods -oyaml \
-n $OPERATOR_NAMESPACE \
> $ARTIFACT_DIR/gpu_operand_pods.yaml

echo "Get the GPU Operator Pods Images"
$K get pods -n $OPERATOR_NAMESPACE \
-o=jsonpath='{range .items[*]}{"\n"}{.metadata.name}{":\t"}{range .spec.containers[*]}{.image}{" "}{end}{end}' \
> $ARTIFACT_DIR/gpu_operand_pod_images.txt

echo "Get the description and logs of the GPU Operator Pods"

for pod in $($K get pods -n $OPERATOR_NAMESPACE -oname);
do
if ! $K get $pod -n $OPERATOR_NAMESPACE -ojsonpath={.metadata.labels} | egrep --quiet '(nvidia|gpu)'; then
echo "Skipping $pod, not a NVIDA/GPU Pod ..."
continue
fi
pod_name=$(echo "$pod" | cut -d/ -f2)

if [ $pod == $OPERATOR_POD_NAME ]; then
echo "Skipping operator pod $pod_name ..."
continue
fi

$K logs $pod \
-n $OPERATOR_NAMESPACE \
--all-containers --prefix \
> $ARTIFACT_DIR/gpu_operand_pod_$pod_name.log

$K logs $pod \
-n $OPERATOR_NAMESPACE \
--all-containers --prefix \
--previous \
> $ARTIFACT_DIR/gpu_operand_pod_$pod_name.previous.log

$K describe $pod \
-n $OPERATOR_NAMESPACE \
> $ARTIFACT_DIR/gpu_operand_pod_$pod_name.descr
done

echo ""
echo "#"
echo "# Operand DaemonSets"
echo "#"
echo ""

echo "Get the DaemonSets in $OPERATOR_NAMESPACE (status)"

$K get ds \
-n $OPERATOR_NAMESPACE \
> $ARTIFACT_DIR/gpu_operand_ds.status


echo "Get the DaemonSets in $OPERATOR_NAMESPACE (yaml)"

$K get ds -oyaml \
-n $OPERATOR_NAMESPACE \
> $ARTIFACT_DIR/gpu_operand_ds.yaml

echo "Get the description of the GPU Operator DaemonSets"

for ds in $($K get ds -n $OPERATOR_NAMESPACE -oname);
do
if ! $K get $ds -n $OPERATOR_NAMESPACE -ojsonpath={.metadata.labels} | egrep --quiet '(nvidia|gpu)'; then
echo "Skipping $ds, not a NVIDA/GPU DaemonSet ..."
continue
fi
$K describe $ds \
-n $OPERATOR_NAMESPACE \
> $ARTIFACT_DIR/gpu_operand_ds_$(echo "$ds" | cut -d/ -f2).descr
done

echo ""
echo "#"
echo "# nvidia-bug-report.sh"
echo "#"
echo ""

for pod in $($K get pods -lopenshift.driver-toolkit -oname -n $OPERATOR_NAMESPACE; $K get pods -lapp=nvidia-driver-daemonset -oname -n $OPERATOR_NAMESPACE; $K get pods -lapp=nvidia-vgpu-manager-daemonset -oname -n $OPERATOR_NAMESPACE);
do
pod_nodename=$($K get $pod -ojsonpath={.spec.nodeName} -n $OPERATOR_NAMESPACE)
echo "Saving nvidia-bug-report from ${pod_nodename} ..."

$K exec -n $OPERATOR_NAMESPACE $pod -- bash -c 'cd /tmp && nvidia-bug-report.sh' >&2 || \
(echo "Failed to collect nvidia-bug-report from ${pod_nodename}" && continue)

$K cp $OPERATOR_NAMESPACE/$(basename $pod):/tmp/nvidia-bug-report.log.gz /tmp/nvidia-bug-report.log.gz || \
(echo "Failed to save nvidia-bug-report from ${pod_nodename}" && continue)

mv /tmp/nvidia-bug-report.log.gz $ARTIFACT_DIR/nvidia-bug-report_${pod_nodename}.log.gz
done

echo ""
echo "#"
echo "# All done!"
if [[ "$0" != "/usr/bin/gather" ]]; then
echo "# Logs saved into ${ARTIFACT_DIR}."
fi
echo "#"