From 333ebbc2a244faf00bd9a73c9d022a2dcef3cfed Mon Sep 17 00:00:00 2001 From: shiva kumar Date: Thu, 22 Aug 2024 13:14:06 +0530 Subject: [PATCH] Pre-compiled end-to-end gpu driver validation Signed-off-by: shiva kumar --- .github/workflows/ci.yaml | 23 ++- .github/workflows/precompiled.yaml | 192 ++++++++++++++++++++-- tests/cases/nvidia-driver.sh | 8 + tests/ci-remote-exec.sh | 12 ++ tests/ci-run-e2e.sh | 8 +- tests/local.sh | 3 +- tests/remote-exec-local.sh | 21 +++ tests/scripts/.definitions.sh | 8 +- tests/scripts/.local.sh | 4 + tests/scripts/.rsync-excludes | 4 - tests/scripts/.rsync-includes | 2 + tests/scripts/end-to-end-nvidia-driver.sh | 2 +- tests/scripts/findkernelversion.sh | 32 ++++ tests/scripts/install-operator.sh | 12 +- tests/scripts/kernel-upgrade-helper.sh | 54 ++++++ tests/scripts/remote_retry.sh | 36 ++++ tests/scripts/sync.sh | 3 +- tests/scripts/upgrade-kernel.sh | 14 ++ 18 files changed, 394 insertions(+), 44 deletions(-) create mode 100755 tests/ci-remote-exec.sh create mode 100755 tests/remote-exec-local.sh delete mode 100644 tests/scripts/.rsync-excludes create mode 100644 tests/scripts/.rsync-includes create mode 100755 tests/scripts/findkernelversion.sh create mode 100755 tests/scripts/kernel-upgrade-helper.sh create mode 100755 tests/scripts/remote_retry.sh create mode 100755 tests/scripts/upgrade-kernel.sh diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 3c6e8690..f1b340d4 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -46,31 +46,36 @@ jobs: id: get_public_dns_name uses: mikefarah/yq@master with: - cmd: yq '.status.properties[] | select(.name == "public-dns-name") | .value' /github/workspace/.cache/holodeck.yaml + cmd: yq '.status.properties[] | select(.name == "public-dns-name") | .value' /github/workspace/.cache/holodeck.yaml - name: Set and Calculate test vars run: | echo "instance_hostname=ubuntu@${{ steps.get_public_dns_name.outputs.result }}" >> $GITHUB_ENV echo "private_key=${{ github.workspace }}/key.pem" >> $GITHUB_ENV + echo "${{ secrets.AWS_SSH_KEY }}" > ${{ github.workspace }}/key.pem && chmod 400 ${{ github.workspace }}/key.pem echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> $GITHUB_ENV DRIVER_VERSIONS=$(grep '^DRIVER_VERSIONS ?=' versions.mk | awk -F' ?= ' '{print $2}') echo "DRIVER_VERSIONS=$DRIVER_VERSIONS" >> $GITHUB_ENV - + echo "PRIVATE_REGISTRY=ghcr.io" >> $GITHUB_ENV + - name: Validate gpu driver env: TEST_CASE: "./tests/cases/nvidia-driver.sh" + GPU_OPERATOR_OPTIONS: "--set driver.repository=${{ env.PRIVATE_REGISTRY }}/nvidia" run: | - echo "${{ secrets.AWS_SSH_KEY }}" > ${private_key} && chmod 400 ${private_key} rc=0 - for driver_version in ${DRIVER_VERSIONS}; do - echo "Running e2e for DRIVER_VERSION=$driver_version" - ./tests/ci-run-e2e.sh ${TEST_CASE} ${COMMIT_SHORT_SHA}-${driver_version} || status=$? + for DRIVER_VERSION in ${DRIVER_VERSIONS}; do + echo "Running e2e for DRIVER_VERSION=$DRIVER_VERSION" + status=0 + OPERATOR_OPTIONS="${GPU_OPERATOR_OPTIONS} --set driver.version=${COMMIT_SHORT_SHA}-${DRIVER_VERSION}" + # add escape character for space + OPERATOR_OPTIONS=$(printf '%q ' "$OPERATOR_OPTIONS") + ./tests/ci-run-e2e.sh "${TEST_CASE}" "${OPERATOR_OPTIONS}" || status=$? if [ $status -ne 0 ]; then - echo "e2e validation failed for driver version $driver_version with status $status" + echo "e2e validation failed for driver version $DRIVER_VERSION with status $status" rc=$status fi done - source ./tests/scripts/.definitions.sh ./tests/scripts/pull.sh /tmp/logs logs exit $rc @@ -80,4 +85,4 @@ jobs: with: name: nvidiadriver-e2e-test-logs path: ./logs/ - retention-days: 15 + retention-days: 15 diff --git a/.github/workflows/precompiled.yaml b/.github/workflows/precompiled.yaml index 89152f82..54a1ad69 100644 --- a/.github/workflows/precompiled.yaml +++ b/.github/workflows/precompiled.yaml @@ -20,19 +20,34 @@ on: - cron: '00 09 * * *' # scheduled job jobs: - pre-compiled: + set-driver-version-matrix: + runs-on: ubuntu-latest + outputs: + driver_branch: ${{ steps.extract_driver_branch.outputs.driver_branch }} + kernel_flavors: ${{ steps.extract_driver_branch.outputs.kernel_flavors }} + steps: + - name: Checkout code + uses: actions/checkout@v4 + - name: Read driver versions + id: extract_driver_branch + run: | + # get driver-branch + DRIVER_BRANCH=("535" "550") + driver_branch_json=$(printf '%s\n' "${DRIVER_BRANCH[@]}" | jq -R . | jq -cs .) + echo "driver_branch=$driver_branch_json" >> $GITHUB_OUTPUT + + # get kernel flavors + KERNEL_FLAVORS=("aws" "azure" "generic" "nvidia" "oracle") + kernel_flavors_json=$(printf '%s\n' "${KERNEL_FLAVORS[@]}" | jq -R . | jq -cs .) + echo "kernel_flavors=$kernel_flavors_json" >> $GITHUB_OUTPUT + + precompiled-image: + needs: set-driver-version-matrix runs-on: ubuntu-latest strategy: matrix: - driver: - - 535 - - 550 - flavor: - - aws - - azure - - generic - - nvidia - - oracle + driver-branch: ${{ fromJson(needs.set-driver-version-matrix.outputs.driver_branch) }} + flavor: ${{ fromJson(needs.set-driver-version-matrix.outputs.kernel_flavors) }} steps: - uses: actions/checkout@v4 name: Check out code @@ -64,10 +79,10 @@ jobs: VERSION: ${COMMIT_SHORT_SHA} BASE_TARGET: jammy run: | - make DRIVER_BRANCH=${{ matrix.driver }} KERNEL_FLAVOR=${{ matrix.flavor }} build-base-${BASE_TARGET} + make DRIVER_BRANCH=${{ matrix.driver-branch }} KERNEL_FLAVOR=${{ matrix.flavor }} build-base-${BASE_TARGET} trap "docker rm -f base-${BASE_TARGET}-${{ matrix.flavor }}" EXIT - docker run -d --name base-${BASE_TARGET}-${{ matrix.flavor }} ghcr.io/nvidia/driver:base-${BASE_TARGET}-${{ matrix.flavor }}-${{ matrix.driver }} + docker run -d --name base-${BASE_TARGET}-${{ matrix.flavor }} ghcr.io/nvidia/driver:base-${BASE_TARGET}-${{ matrix.flavor }}-${{ matrix.driver-branch }} # try 3 times every 10 seconds to get the file, if success exit the loop for i in {1..3}; do docker cp base-${BASE_TARGET}-${{ matrix.flavor }}:/var/kernel_version.txt kernel_version.txt && break @@ -81,4 +96,155 @@ jobs: DIST: signed_ubuntu22.04 run: | source kernel_version.txt && \ - make DRIVER_VERSIONS=${DRIVER_VERSIONS} DRIVER_BRANCH=${{ matrix.driver }} build-${DIST}-${DRIVER_VERSION} + make DRIVER_VERSIONS=${DRIVER_VERSIONS} DRIVER_BRANCH=${{ matrix.driver-branch }} build-${DIST}-${DRIVER_VERSION} + + determine-e2e-test-matrix: + runs-on: ubuntu-latest + needs: + - precompiled-image + - set-driver-version-matrix + outputs: + matrix_values_not_empty: ${{ steps.set_kernel_version.outputs.matrix_values_not_empty }} + matrix_values: ${{ steps.set_kernel_version.outputs.matrix_values }} + steps: + - name: Check out code + uses: actions/checkout@v4 + - name: Login to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Set kernel version + id: set_kernel_version + env: + BASE_TARGET: "jammy" + DIST: "ubuntu22.04" + run: | + echo "matrix_values_not_empty=0" >> $GITHUB_OUTPUT + + kernel_flavors_json='${{ needs.set-driver-version-matrix.outputs.kernel_flavors }}' + kernel_flavors=$(echo "$kernel_flavors_json" | jq -r '.[]') + driver_branch_json='${{ needs.set-driver-version-matrix.outputs.driver_branch }}' + driver_branch=$(echo "$driver_branch_json" | jq -r '.[]') + + kernel_versions=() + for kernel_flavor in $kernel_flavors; do + # FIXME -- remove if condition, once azure kernel upgrade starts working + if [[ "$kernel_flavor" == "azure" ]]; then + echo "skipping azure kernel testing" + continue + fi + for DRIVER_BRANCH in $driver_branch; do + source ./tests/scripts/findkernelversion.sh "$BASE_TARGET" "${kernel_flavor}" "$DRIVER_BRANCH" "$DIST" + if [[ "$should_continue" == true ]]; then + echo "matrix_values_not_empty=1" >> $GITHUB_OUTPUT + break + fi + done + if [[ "$should_continue" == false ]]; then + echo "Skipping e2e tests for the following driver tag: ${KERNEL_VERSION}-${kernel_flavor}-${DIST}" + else + KERNEL_VERSION=$(echo "$KERNEL_VERSION" | tr -d ' \n') + kernel_versions+=("$KERNEL_VERSION") + echo "Adding the following tag to the e2e test matrix: ${KERNEL_VERSION}-${kernel_flavor}-${DIST}" + fi + done + + # Convert array to JSON format and assign + echo "[]" > $GITHUB_WORKSPACE/matrix_values.json + printf '%s\n' "${kernel_versions[@]}" | jq -R . | jq -s . > $GITHUB_WORKSPACE/matrix_values.json + echo "matrix_values=$(cat $GITHUB_WORKSPACE/matrix_values.json | jq -c .)" >> $GITHUB_OUTPUT + + e2e-tests-nvidiadriver: + runs-on: ubuntu-latest + needs: + - determine-e2e-test-matrix + - set-driver-version-matrix + if: ${{ needs.determine-e2e-test-matrix.outputs.matrix_values_not_empty == '1' }} + strategy: + matrix: + kernel_version: ${{ fromJson(needs.determine-e2e-test-matrix.outputs.matrix_values) }} + steps: + - name: Check out code + uses: actions/checkout@v4 + - name: Set up Holodeck + uses: NVIDIA/holodeck@v0.2.1 + env: + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SSH_KEY: ${{ secrets.AWS_SSH_KEY }} + with: + aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + aws_ssh_key: ${{ secrets.AWS_SSH_KEY }} + holodeck_config: "tests/holodeck.yaml" + + - name: Get public dns name + id: get_public_dns_name + uses: mikefarah/yq@master + with: + cmd: yq '.status.properties[] | select(.name == "public-dns-name") | .value' /github/workspace/.cache/holodeck.yaml + - name: Set and Calculate test vars + run: | + echo "instance_hostname=ubuntu@${{ steps.get_public_dns_name.outputs.result }}" >> $GITHUB_ENV + echo "private_key=${{ github.workspace }}/key.pem" >> $GITHUB_ENV + echo "${{ secrets.AWS_SSH_KEY }}" > ${{ github.workspace }}/key.pem && chmod 400 ${{ github.workspace }}/key.pem + echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> $GITHUB_ENV + echo "PRIVATE_REGISTRY=ghcr.io" >> $GITHUB_ENV + KERNEL_VERSION="${{ matrix.kernel_version }}" + echo "KERNEL_VERSION=$KERNEL_VERSION" >> $GITHUB_ENV + + - name: Upgrade the kernel for Precompiled e2e test + env: + UPGRADE_KERNEL_SCRIPT: "./tests/scripts/upgrade-kernel.sh" + run: | + status=0 + ./tests/ci-remote-exec.sh "${UPGRADE_KERNEL_SCRIPT}" "${KERNEL_VERSION}" || status=$? + # On the target system, all scripts/test-case exit with code 1 for error handling. + # However, since reboot-related disconnections break the SSH connection + # and can cause the entire job to exit, we should ignore all errors except + # exit code 1. During a reboot, exit code 1 will not be thrown, so handling + # other errors as code 1 will ensure proper management of reboot scenarios + if [ $status -eq 1 ]; then + echo "Kernel version $KERNEL_VERSION upgrade failed" + exit 1 + fi + ./tests/scripts/remote_retry.sh || status=$? + if [ $status -ne 0 ]; then + echo "Failed to connect to remote instance" + exit $status + fi + + - name: Precompiled e2e test gpu driver validation + env: + TEST_CASE: "./tests/cases/nvidia-driver.sh" + GPU_OPERATOR_OPTIONS: "--set driver.repository=${{ env.PRIVATE_REGISTRY }}/nvidia --set driver.usePrecompiled=true" + run: | + rc=0 + # for precompiled driver we are setting driver branch as driver version + driver_versions_json='${{ needs.set-driver-version-matrix.outputs.driver_branch }}' + driver_versions=$(echo "$driver_versions_json" | jq -r '.[]') + for DRIVER_VERSION in $driver_versions; do + echo "Running e2e for DRIVER_VERSION=$DRIVER_VERSION" + status=0 + OPERATOR_OPTIONS="${GPU_OPERATOR_OPTIONS} --set driver.version=${DRIVER_VERSION}" + # add escape character for space + OPERATOR_OPTIONS=$(printf '%q ' "$OPERATOR_OPTIONS") + ./tests/ci-run-e2e.sh "${TEST_CASE}" "${OPERATOR_OPTIONS}" || status=$? + if [ $status -eq 1 ]; then + echo "e2e validation failed for driver version $DRIVER_VERSION with status $status" + rc=$status + fi + done + ./tests/scripts/pull.sh /tmp/logs logs + exit $rc + + - name: Archive test logs + if: ${{ failure() }} + uses: actions/upload-artifact@v4 + with: + name: nvidiadriver-Precompiled-e2e-test-logs + path: ./logs/ + retention-days: 15 diff --git a/tests/cases/nvidia-driver.sh b/tests/cases/nvidia-driver.sh index d2afad83..dcd9b509 100755 --- a/tests/cases/nvidia-driver.sh +++ b/tests/cases/nvidia-driver.sh @@ -1,6 +1,14 @@ #! /bin/bash # This test case runs the operator installation / test case with the default options. +if [[ $# -lt 1 ]]; then + echo "Error: $0 must be called with driver options" + exit 1 +fi + +# export gpu-operator options +export TEST_CASE_ARGS="$1" + SCRIPTS_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )"/../scripts && pwd )" source "${SCRIPTS_DIR}"/.definitions.sh diff --git a/tests/ci-remote-exec.sh b/tests/ci-remote-exec.sh new file mode 100755 index 00000000..e9ed1b97 --- /dev/null +++ b/tests/ci-remote-exec.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +set -xe + +if [[ $# -lt 1 ]]; then + echo "Error:$0 must be called with 1(REMOTE_EXEC) or more than 1 args (REMOTE_EXEC, ARGS1 ARGS2 etc)" + exit 1 +fi + +TEST_DIR="$(pwd)/tests" + +${TEST_DIR}/remote-exec-local.sh "$@" diff --git a/tests/ci-run-e2e.sh b/tests/ci-run-e2e.sh index 621a7a8e..9a3b328a 100755 --- a/tests/ci-run-e2e.sh +++ b/tests/ci-run-e2e.sh @@ -3,14 +3,10 @@ set -xe if [[ $# -ne 2 ]]; then - echo "TEST_CASE TARGET_DRIVER_VERSION are required" + echo "TEST_CASE TEST_CASE_ARGS are required" exit 1 fi -export TEST_CASE=${1} -export TARGET_DRIVER_VERSION=${2} - - TEST_DIR="$(pwd)/tests" -${TEST_DIR}/local.sh +${TEST_DIR}/local.sh "$@" diff --git a/tests/local.sh b/tests/local.sh index 86918588..a8acd3bf 100755 --- a/tests/local.sh +++ b/tests/local.sh @@ -23,5 +23,4 @@ remote SKIP_PREREQUISITES="${SKIP_PREREQUISITES}" ./tests/scripts/prerequisites. # are forwarded to the remote shell. remote \ PROJECT="${PROJECT}" \ - TARGET_DRIVER_VERSION="${TARGET_DRIVER_VERSION}" \ - ${TEST_CASE} + "$@" diff --git a/tests/remote-exec-local.sh b/tests/remote-exec-local.sh new file mode 100755 index 00000000..20dcf163 --- /dev/null +++ b/tests/remote-exec-local.sh @@ -0,0 +1,21 @@ +#! /bin/bash + +if [[ $# -ge 1 ]]; then + REMOTE_EXEC=${1} + test -n "${REMOTE_EXEC}" +fi +test -f ${PROJECT_DIR}/${REMOTE_EXEC} + +export PROJECT="gpu-driver-container" + +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )"/scripts && pwd )" +source ${SCRIPT_DIR}/.definitions.sh +source ${SCRIPT_DIR}/.local.sh + +# Sync the project folder to the remote +${SCRIPT_DIR}/push.sh + +# We trigger the specified script on the remote instance. +remote \ + PROJECT="${PROJECT}" \ + "$@" diff --git a/tests/scripts/.definitions.sh b/tests/scripts/.definitions.sh index f254bc00..945bb04c 100644 --- a/tests/scripts/.definitions.sh +++ b/tests/scripts/.definitions.sh @@ -14,13 +14,13 @@ CASES_DIR="$( cd "${TEST_DIR}/cases" && pwd )" : ${TEST_NAMESPACE:="test-operator"} -: ${PRIVATE_REGISTRY:="ghcr.io"} - : ${HELM_NVIDIA_REPO:="https://helm.ngc.nvidia.com/nvidia"} -: ${TARGET_DRIVER_VERSION:="550.90.07"} - : ${DAEMON_POD_STATUS_TIME_OUT:="15m"} : ${POD_STATUS_TIME_OUT:="2m"} : ${LOG_DIR:="/tmp/logs"} + +: ${SYSTEM_ONLINE_CHECK_TIMEOUT:="900"} + +: ${BASE_TARGET:="jammy"} diff --git a/tests/scripts/.local.sh b/tests/scripts/.local.sh index 7971a404..f3d98b2f 100644 --- a/tests/scripts/.local.sh +++ b/tests/scripts/.local.sh @@ -3,3 +3,7 @@ function remote() { ${SCRIPT_DIR}/remote.sh "cd ${PROJECT} && "$@"" } + +function remote_retry() { + ${SCRIPT_DIR}/remote_retry.sh +} diff --git a/tests/scripts/.rsync-excludes b/tests/scripts/.rsync-excludes deleted file mode 100644 index 06c2f6ef..00000000 --- a/tests/scripts/.rsync-excludes +++ /dev/null @@ -1,4 +0,0 @@ -vendor/ -.git -cnt-ci -key.pem diff --git a/tests/scripts/.rsync-includes b/tests/scripts/.rsync-includes new file mode 100644 index 00000000..f91de959 --- /dev/null +++ b/tests/scripts/.rsync-includes @@ -0,0 +1,2 @@ +tests/ +tests/*** diff --git a/tests/scripts/end-to-end-nvidia-driver.sh b/tests/scripts/end-to-end-nvidia-driver.sh index d272efab..ab2db9a1 100755 --- a/tests/scripts/end-to-end-nvidia-driver.sh +++ b/tests/scripts/end-to-end-nvidia-driver.sh @@ -11,7 +11,7 @@ ${SCRIPT_DIR}/install-operator.sh "${SCRIPT_DIR}"/verify-operator.sh -echo "--------------Verification completed for GPU Operator, uninstalling the operator--------------" +echo "--------------Verification completed for GPU Operator, uninstalling the GPU operator--------------" ${SCRIPT_DIR}/uninstall-operator.sh ${TEST_NAMESPACE} "gpu-operator" diff --git a/tests/scripts/findkernelversion.sh b/tests/scripts/findkernelversion.sh new file mode 100755 index 00000000..b0f12343 --- /dev/null +++ b/tests/scripts/findkernelversion.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +if [[ $# -ne 4 ]]; then + echo " BASE_TARGET KERNEL_FLAVOR DRIVER_BRANCH DIST are required" + exit 1 +fi + +export BASE_TARGET="${1}" +export KERNEL_FLAVOR="${2}" +export DRIVER_BRANCH="${3}" +export DIST="${4}" + +export REGCTL_VERSION=v0.4.7 +mkdir -p bin +curl -sSLo bin/regctl https://github.com/regclient/regclient/releases/download/${REGCTL_VERSION}/regctl-linux-amd64 +chmod a+x bin/regctl +export PATH=$(pwd)/bin:${PATH} + +# calculate kernel version of latest image +regctl image get-file ghcr.io/nvidia/driver:base-${BASE_TARGET}-${KERNEL_FLAVOR}-${DRIVER_BRANCH} /var/kernel_version.txt ./kernel_version.txt +export $(grep -oP 'KERNEL_VERSION=[^ ]+' ./kernel_version.txt) + +# calculate driver tag +status=0 +echo "regctl tag ls nvcr.io/nvidia/driver | grep "^${DRIVER_BRANCH}-${KERNEL_VERSION}-${DIST}$"" + +regctl tag ls nvcr.io/nvidia/driver | grep "^${DRIVER_BRANCH}-${KERNEL_VERSION}-${DIST}$" || status=$? +if [[ $status -eq 0 ]]; then + export should_continue=false +else + export should_continue=true +fi diff --git a/tests/scripts/install-operator.sh b/tests/scripts/install-operator.sh index 3acfcffb..2b4bcbaf 100755 --- a/tests/scripts/install-operator.sh +++ b/tests/scripts/install-operator.sh @@ -5,10 +5,14 @@ if [[ "${SKIP_INSTALL}" == "true" ]]; then exit 0 fi +echo "Checking current kernel version..." +CURRENT_KERNEL=$(uname -r) +echo "Current kernel version: $CURRENT_KERNEL" + SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" source ${SCRIPT_DIR}/.definitions.sh -OPERATOR_OPTIONS="${OPERATOR_OPTIONS} --set driver.repository=${PRIVATE_REGISTRY}/nvidia --set driver.version=${TARGET_DRIVER_VERSION}" +OPERATOR_OPTIONS="${TEST_CASE_ARGS}" # add helm driver repo helm repo add nvidia ${HELM_NVIDIA_REPO} && helm repo update @@ -17,8 +21,8 @@ helm repo add nvidia ${HELM_NVIDIA_REPO} && helm repo update kubectl create namespace "${TEST_NAMESPACE}" # Run the helm install command -echo "OPERATOR_OPTIONS: $OPERATOR_OPTIONS" -${HELM} install gpu-operator nvidia/gpu-operator \ +echo "OPERATOR_OPTIONS: ${OPERATOR_OPTIONS}" +eval ${HELM} install gpu-operator nvidia/gpu-operator \ -n "${TEST_NAMESPACE}" \ - ${OPERATOR_OPTIONS} \ + "${OPERATOR_OPTIONS}" \ --wait diff --git a/tests/scripts/kernel-upgrade-helper.sh b/tests/scripts/kernel-upgrade-helper.sh new file mode 100755 index 00000000..a8926397 --- /dev/null +++ b/tests/scripts/kernel-upgrade-helper.sh @@ -0,0 +1,54 @@ +#!/bin/bash + +if [[ "${SKIP_INSTALL}" == "true" ]]; then + echo "Skipping install: SKIP_INSTALL=${SKIP_INSTALL}" + exit 0 +fi + +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +source "${SCRIPT_DIR}"/.definitions.sh + +echo "Checking current kernel version..." +CURRENT_KERNEL=$(uname -r) +echo "Current kernel version: $CURRENT_KERNEL" + +if [ "${CURRENT_KERNEL}" != ${KERNEL_VERSION} ]; then + echo "" + echo "" + echo "--------------Upgrading kernel to ${KERNEL_VERSION}--------------" + + # Set non-interactive frontend for apt and disable editor prompts + # Perform the installation non-interactively + export DEBIAN_FRONTEND=noninteractive + export EDITOR=/bin/true + echo 'debconf debconf/frontend select Noninteractive' | sudo debconf-set-selections + + sudo apt-get update -y || true + + # The removal of the currently running kernel (apt remove linux-image-*) sometimes works and sometimes does not. + # Occasionally, it requires two reboots, or an apt upgrade. However, removing all traces of the old/current + # kernel from the boot directory works consistently, which is why this approach has been adopted. + sudo rm -rf /boot/*${CURRENT_KERNEL}* || true + sudo rm -rf /lib/modules/*${CURRENT_KERNEL}* + sudo rm -rf /boot/*.old + + #install new kernel + sudo apt-get install --allow-downgrades linux-image-${KERNEL_VERSION} linux-headers-${KERNEL_VERSION} linux-modules-${KERNEL_VERSION} -y || exit 1 + if [ $? -ne 0 ]; then + echo "Kernel upgrade failed." + exit 1 + fi + echo "update grub and initramfs..." + sudo update-grub || true + sudo update-initramfs -u -k ${KERNEL_VERSION} || true + echo "Rebooting ..." + # Run the reboot command with nohup to avoid abrupt SSH closure issues + nohup sudo reboot & + + echo "--------------Kernel upgrade completed--------------" +else + echo "--------------Kernel upgrade not required, current kernel version ${KERNEL_VERSION}--------------" +fi + +# Exit with a success code since the reboot command was issued successfully +exit 0 diff --git a/tests/scripts/remote_retry.sh b/tests/scripts/remote_retry.sh new file mode 100755 index 00000000..e8e6038d --- /dev/null +++ b/tests/scripts/remote_retry.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +source ${SCRIPT_DIR}/.definitions.sh +INTERVAL=30 +SECONDS_ELAPSED=0 + +set +e +# Function to handle timeout exit +handle_timeout() { + echo "Failed to connect within the timeout period of $SYSTEM_ONLINE_CHECK_TIMEOUT seconds." + exit 1 +} + +# Set trap for timeout +trap handle_timeout EXIT + +# sleep before to handle restart of the system +sleep 60; + +while [ $SECONDS_ELAPSED -lt $SYSTEM_ONLINE_CHECK_TIMEOUT ]; do + # Attempt to connect via SSH and ignore errors + status=0 + ( + ssh -o ConnectTimeout=5 -i ${private_key} ${instance_hostname} "exit" + ) >/dev/null 2>&1 + status=$? + if [ $status -eq 0 ]; then + echo "Successfully connected to ${instance_hostname}." + trap - EXIT # Disable the timeout trap since the connection was successful + exit 0 + fi + sleep $INTERVAL + SECONDS_ELAPSED=$((SECONDS_ELAPSED + INTERVAL)) + echo "ssh retry...elpased time $SECONDS_ELAPSED" +done diff --git a/tests/scripts/sync.sh b/tests/scripts/sync.sh index cb020752..555d7b86 100755 --- a/tests/scripts/sync.sh +++ b/tests/scripts/sync.sh @@ -12,6 +12,7 @@ source ${SCRIPT_DIR}/.local.sh rsync -e "ssh -i ${private_key} -o StrictHostKeyChecking=no" \ -avz --delete \ - --exclude-from="${SCRIPT_DIR}/.rsync-excludes" \ + --include-from="${SCRIPT_DIR}/.rsync-includes" \ + --exclude='*' \ ${@} diff --git a/tests/scripts/upgrade-kernel.sh b/tests/scripts/upgrade-kernel.sh new file mode 100755 index 00000000..0c575574 --- /dev/null +++ b/tests/scripts/upgrade-kernel.sh @@ -0,0 +1,14 @@ +#! /bin/bash +# This test case runs the operator installation / test case with the default options. + +if [[ $# -ne 1 ]]; then + echo "Error: $0 must be called with kernel_version" + exit 1 +fi + +SCRIPTS_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )"/../scripts && pwd )" +source "${SCRIPTS_DIR}"/.definitions.sh + +# export kernel version and Run an end-to-end test cycle +export KERNEL_VERSION="$1" +"${SCRIPTS_DIR}"/kernel-upgrade-helper.sh