Skip to content

Commit

Permalink
test: update CI setup for running GPU unit tests (#10230)
Browse files Browse the repository at this point in the history
  • Loading branch information
jgongd authored Nov 22, 2024
1 parent 81b2fce commit 821e8a5
Showing 1 changed file with 61 additions and 16 deletions.
77 changes: 61 additions & 16 deletions .circleci/real_config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,9 @@ parameters:
gpu-machine-image:
type: string
default: linux-cuda-12:default
gpu-machine-resource-class:
type: string
default: gpu.nvidia.small.multi
# DEFAULT_PT_GPU_IMAGE: Pytorch training image reference used by the tests
default-pt-gpu-hpc-image:
type: string
Expand Down Expand Up @@ -405,6 +408,10 @@ commands:
python-version:
type: string
default: "3.8.18"
install-nvidia-apex:
description: "Install a dependency required by some of the GPU tests."
type: boolean
default: false
steps:
- run:
name: Write cache key
Expand All @@ -422,6 +429,7 @@ commands:
fi
echo '<<parameters.python-version>>' >> /tmp/cachefile
echo '<<parameters.install-python>>' >> /tmp/cachefile
echo '<<parameters.install-nvidia-apex>>' >> /tmp/cachefile
date -u '+%y/%m/%d' >> /tmp/cachefile
- restore_cache:
Expand Down Expand Up @@ -504,6 +512,21 @@ commands:
tools/scripts/retry.sh pip install -r $i
done
fi
- when:
condition: <<parameters.install-nvidia-apex>>
steps:
- run:
name: Install Nvidia Apex
description: "Apex installation needs CUDA 12.1 because the PyTorch binaries were compiled with CUDA 12.1."
command: |
if ! pip show apex; then
wget https://developer.download.nvidia.com/compute/cuda/12.1.0/local_installers/cuda_12.1.0_530.30.02_linux.run
sudo sh cuda_12.1.0_530.30.02_linux.run --toolkit --silent
nvcc --version
git clone https://github.com/NVIDIA/apex ~/apex
pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ~/apex
fi
- save_cache:
key: det-python-deps-<<pipeline.parameters.cache-buster>>-{{ checksum "/tmp/cachefile" }}
paths:
Expand Down Expand Up @@ -2373,9 +2396,9 @@ jobs:
path: /tmp/test-results

test-unit-harness-gpu-tf:
docker:
- image: determinedai/tensorflow-ngc-dev:0736b6d
resource_class: determined-ai/container-runner-gpu
machine:
image: <<pipeline.parameters.gpu-machine-image>>
resource_class: <<pipeline.parameters.gpu-machine-resource-class>>
steps:
- run: mkdir -p ~/.ssh && ssh-keyscan github.com >> ~/.ssh/known_hosts
- checkout
Expand All @@ -2386,7 +2409,12 @@ jobs:
- run: pip install mypy pytest coverage
- install-codecov
- setup-paths
- run: COVERAGE_FILE=/root/project/test-unit-harness-gpu-tf-pycov make -C harness test-gpu-tf
- setup-python-venv:
install-python: true
determined: true
extra-requirements-file: "harness/tests/requirements/requirements-harness.txt"
executor: machine-<<pipeline.parameters.gpu-machine-resource-class>>
- run: COVERAGE_FILE=/home/circleci/project/test-unit-harness-gpu-tf-pycov make -C harness test-gpu-tf
- run: coverage xml -i --data-file=./test-unit-harness-gpu-tf-pycov
- run: codecov -v -t $CODECOV_TOKEN -F harness
- persist_to_workspace:
Expand All @@ -2397,9 +2425,9 @@ jobs:
path: /tmp/test-results

test-unit-harness-pytorch2-gpu:
docker:
- image: determinedai/pytorch-ngc-dev:0736b6d
resource_class: determined-ai/container-runner-gpu
machine:
image: <<pipeline.parameters.gpu-machine-image>>
resource_class: <<pipeline.parameters.gpu-machine-resource-class>>
steps:
- run: mkdir -p ~/.ssh && ssh-keyscan github.com >> ~/.ssh/known_hosts
- checkout
Expand All @@ -2410,7 +2438,13 @@ jobs:
- run: pip install mypy pytest coverage
- install-codecov
- setup-paths
- run: COVERAGE_FILE=/root/project/test-unit-harness-pytorch2-gpu-pycov make -C harness test-pytorch-gpu
- setup-python-venv:
install-python: true
determined: true
extra-requirements-file: "harness/tests/requirements/requirements-harness.txt"
executor: machine-<<pipeline.parameters.gpu-machine-resource-class>>
install-nvidia-apex: true
- run: COVERAGE_FILE=/home/circleci/project/test-unit-harness-pytorch2-gpu-pycov make -C harness test-pytorch-gpu
- run: coverage xml -i --data-file=./test-unit-harness-pytorch2-gpu-pycov
- run: codecov -v -t $CODECOV_TOKEN -F harness
- persist_to_workspace:
Expand Down Expand Up @@ -2444,9 +2478,9 @@ jobs:
path: /tmp/test-results

test-unit-harness-gpu-parallel:
docker:
- image: determinedai/pytorch-ngc-dev:0736b6d
resource_class: determined-ai/container-runner-multi-gpu
machine:
image: <<pipeline.parameters.gpu-machine-image>>
resource_class: <<pipeline.parameters.gpu-machine-resource-class>>
steps:
- run: mkdir -p ~/.ssh && ssh-keyscan github.com >> ~/.ssh/known_hosts
- checkout
Expand All @@ -2457,7 +2491,13 @@ jobs:
- run: pip install mypy pytest coverage
- install-codecov
- setup-paths
- run: COVERAGE_FILE=/root/project/test-unit-harness-gpu-parallel-pycov make -C harness test-gpu-parallel
- setup-python-venv:
install-python: true
determined: true
extra-requirements-file: "harness/tests/requirements/requirements-harness.txt"
executor: machine-<<pipeline.parameters.gpu-machine-resource-class>>
install-nvidia-apex: true
- run: COVERAGE_FILE=/home/circleci/project/test-unit-harness-gpu-parallel-pycov make -C harness test-gpu-parallel
- run: coverage xml -i --data-file=./test-unit-harness-gpu-parallel-pycov
- run: codecov -v -t $CODECOV_TOKEN -F harness
- persist_to_workspace:
Expand All @@ -2468,9 +2508,9 @@ jobs:
path: /tmp/test-results

test-unit-harness-gpu-deepspeed:
docker:
- image: determinedai/pytorch-ngc-dev:0736b6d
resource_class: determined-ai/container-runner-gpu
machine:
image: <<pipeline.parameters.gpu-machine-image>>
resource_class: <<pipeline.parameters.gpu-machine-resource-class>>
steps:
- run: mkdir -p ~/.ssh && ssh-keyscan github.com >> ~/.ssh/known_hosts
- checkout
Expand All @@ -2481,7 +2521,12 @@ jobs:
- run: pip install mypy pytest coverage
- install-codecov
- setup-paths
- run: COVERAGE_FILE=/root/project/test-unit-harness-gpu-deepspeed-pycov make -C harness test-gpu-deepspeed
- setup-python-venv:
install-python: true
determined: true
extra-requirements-file: "harness/tests/requirements/requirements-harness.txt"
executor: machine-<<pipeline.parameters.gpu-machine-resource-class>>
- run: COVERAGE_FILE=/home/circleci/project/test-unit-harness-gpu-deepspeed-pycov make -C harness test-gpu-deepspeed
- run: coverage xml -i --data-file=./test-unit-harness-gpu-deepspeed-pycov
- run: codecov -v -t $CODECOV_TOKEN -F harness
- persist_to_workspace:
Expand Down

0 comments on commit 821e8a5

Please sign in to comment.