ci: Restore rosetta-t5x unit tests

NVIDIA · Apr 22, 2024 · 99db46d · 99db46d
1 parent 246f8b6
commit 99db46d
Show file tree

Hide file tree

Showing 2 changed files with 109 additions and 119 deletions.
diff --git a/.github/workflows/_test_rosetta.yaml b/.github/workflows/_test_rosetta.yaml
diff --git a/.github/workflows/_test_rosetta_t5x.yaml b/.github/workflows/_test_rosetta_t5x.yaml
@@ -6,34 +6,33 @@ on:
       T5X_IMAGE:
         type: string
         description: T5X image from ghcr.io/nvidia/t5x
-        default: 'ghcr.io/nvidia/t5x:latest'
+        default: "ghcr.io/nvidia/t5x:latest"
         required: false
       BADGE_FILENAME:
         type: string
-        description: 'Name of the endpoint JSON file for shields.io badge'
+        description: "Name of the endpoint JSON file for shields.io badge"
         required: false
-        default: 'badge-rosetta-t5x-mgmn-test.json'
+        default: "badge-rosetta-t5x-mgmn-test.json"
       ARTIFACT_NAME:
         type: string
-        description: 'Name of the artifact zip file'
+        description: "Name of the artifact zip file"
         required: false
-        default: 'artifact-rosetta-t5x-mgmn-test'
+        default: "artifact-rosetta-t5x-mgmn-test"
       FW_NAME:
         type: string
-        description: 'Name of the framework being used'
+        description: "Name of the framework being used"
         required: false
-        default: 'rosetta-t5x'
+        default: "rosetta-t5x"
     outputs:
       TEST_STATUS:
-        description: 'Summary pass/fail value indicating if results from tests are acceptable'
+        description: "Summary pass/fail value indicating if results from tests are acceptable"
         value: ${{ jobs.sitrep.outputs.STATUS }}
 
 env:
   BATCH_SIZE_PER_GPU: 32
   VIT_BATCH_SIZE_PER_GPU: 256
 
 jobs:
-
   single-process-multi-device:
     strategy:
       matrix:
@@ -63,10 +62,10 @@ jobs:
         uses: webfactory/[email protected]
         with:
           ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
-          
+
       - name: Check out the repository under ${GITHUB_WORKSPACE}
         uses: actions/checkout@v4
-        
+
       - name: Setup SSH known hosts
         id: ssh-known-hosts
         run: |
@@ -182,7 +181,7 @@ jobs:
               dump = {'state': "${{ steps.submit.outputs.SLURM_STATE }}", 'exitcode': "${{ steps.submit.outputs.SLURM_EXITCODE }}"}
               json.dump(dump, f)
           EOF
-          
+
       - name: Generate sitrep
         if: success() || failure()
         shell: bash -x -e {0}
@@ -196,7 +195,7 @@ jobs:
           passed_tests=$(jq -r '. | select ((.state == "COMPLETED") and (.exitcode == "0")) | .state' $EXIT_STATUSES | wc -l)
           failed_tests=$(jq -r '. | select ((.state != "COMPLETED") or (.exitcode != "0")) | .state' $EXIT_STATUSES | wc -l)
           total_tests=$(ls $EXIT_STATUSES | wc -l)
-          
+
           if [[ ${failed_tests} > 0 ]] || [[ ${total_tests} == 0 ]]; then
             badge_message='error'
             badge_color=red
@@ -402,7 +401,7 @@ jobs:
           passed_tests=$(jq -r '. | select ((.state == "COMPLETED") and (.exitcode == "0")) | .state' $EXIT_STATUSES | wc -l)
           failed_tests=$(jq -r '. | select ((.state != "COMPLETED") or (.exitcode != "0")) | .state' $EXIT_STATUSES | wc -l)
           total_tests=$(ls $EXIT_STATUSES | wc -l)
-          
+
           if [[ ${failed_tests} > 0 ]] || [[ ${total_tests} == 0 ]]; then
             badge_message='error'
             badge_color=red
@@ -429,7 +428,7 @@ jobs:
           color="${badge_color}" \
           to_json schemaVersion label message color \
           > output/${{ env.BADGE_FILENAME_PREFIX }}-${{ steps.meta.outputs.TEST_CASE_NAME }}.json
- 
+
       - name: Upload training logs as artifacts
         uses: actions/upload-artifact@v4
         with:
@@ -571,7 +570,7 @@ jobs:
           passed_tests=$(jq -r '. | select ((.state == "COMPLETED") and (.exitcode == "0")) | .state' $EXIT_STATUSES | wc -l)
           failed_tests=$(jq -r '. | select ((.state != "COMPLETED") or (.exitcode != "0")) | .state' $EXIT_STATUSES | wc -l)
           total_tests=$(ls $EXIT_STATUSES | wc -l)
-          
+
           if [[ ${failed_tests} > 0 ]] || [[ ${total_tests} == 0 ]]; then
             badge_message='error'
             badge_color=red
@@ -744,7 +743,7 @@ jobs:
           passed_tests=$(jq -r '. | select ((.state == "COMPLETED") and (.exitcode == "0")) | .state' $EXIT_STATUSES | wc -l)
           failed_tests=$(jq -r '. | select ((.state != "COMPLETED") or (.exitcode != "0")) | .state' $EXIT_STATUSES | wc -l)
           total_tests=$(ls $EXIT_STATUSES | wc -l)
-          
+
           if [[ ${failed_tests} > 0 ]] || [[ ${total_tests} == 0 ]]; then
             badge_message='error'
             badge_color=red
@@ -771,15 +770,21 @@ jobs:
           color="${badge_color}" \
           to_json schemaVersion label message color \
           > output/${{ env.BADGE_FILENAME_PREFIX }}-${{ steps.meta.outputs.TEST_CASE_NAME }}.json
- 
+
       - name: Upload training logs as artifacts
         uses: actions/upload-artifact@v4
         with:
           name: ${{ steps.meta.outputs.JOB_NAME }}
           path: output/*
 
   metrics:
-    needs: [multi-gpu-multi-node, single-process-multi-device, vit-single-process-multi-device, vit-multi-gpu-multi-node]
+    needs:
+      [
+        multi-gpu-multi-node,
+        single-process-multi-device,
+        vit-single-process-multi-device,
+        vit-multi-gpu-multi-node,
+      ]
     runs-on: ubuntu-22.04
 
     steps:
@@ -810,7 +815,7 @@ jobs:
           path: |
             report.jsonl
             *_metrics.json
-  
+
   sitrep:
     needs: metrics
     if: "!cancelled()"
@@ -820,10 +825,16 @@ jobs:
       BADGE_FILENAME: ${{ inputs.BADGE_FILENAME }}
       ARTIFACT_NAME: ${{ inputs.ARTIFACT_NAME }}
       FW_NAME: ${{ inputs.FW_NAME }}
-      
+
   summary:
     runs-on: ubuntu-22.04
-    needs: [multi-gpu-multi-node, single-process-multi-device, vit-single-process-multi-device, vit-multi-gpu-multi-node]
+    needs:
+      [
+        multi-gpu-multi-node,
+        single-process-multi-device,
+        vit-single-process-multi-device,
+        vit-multi-gpu-multi-node,
+      ]
     if: "!cancelled()"
     steps:
       - name: Generate TensorBoard query URL
@@ -848,3 +859,79 @@ jobs:
           if [[ ${{ needs.sitrep.outputs.STATUS }} != success ]]; then
             exit 1
           fi
+
+  # Runs the Rosetta pytest suite inside the T5X container on a self-hosted
+  # V100 runner and uploads the pytest-reportlog JSONL file as an artifact.
+  unit-tests:
+    runs-on: [self-hosted, V100]
+    env:
+      TEST_ARTIFACT_NAME: rosetta-test-logs
+      TEST_LOG_LOCAL_PATH: /log/unit-report.jsonl
+    # Expose the artifact name as a job output so that publish-test can
+    # resolve it via needs.unit-tests.outputs.TEST_ARTIFACT_NAME. Without
+    # this declaration the expression expands to an empty string.
+    outputs:
+      TEST_ARTIFACT_NAME: ${{ env.TEST_ARTIFACT_NAME }}
+    steps:
+      - name: Print environment variables
+        run: |
+          env
+
+      - name: Print GPU information
+        run: nvidia-smi
+
+      - name: Login to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.repository_owner }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      # Re-tag the pulled image so the custom shell below can refer to a
+      # fixed name (rosetta:latest) regardless of the T5X_IMAGE input.
+      - name: Pull Rosetta image
+        shell: bash -x -e {0}
+        run: |
+          docker pull ${{ inputs.T5X_IMAGE }}
+          docker tag ${{ inputs.T5X_IMAGE }} rosetta:latest
+
+      # Custom shell: the generated step script ({0}) is mounted into the
+      # container as /cmd.sh and executed there. /log is bind-mounted so the
+      # report written to TEST_LOG_LOCAL_PATH is available on the host for
+      # the upload step. `|| true` keeps the step green when tests fail; the
+      # pass/fail tally is computed from the report in publish-test.
+      - name: Run Rosetta tests w/ docker
+        shell: docker run --gpus all -v {0}:/cmd.sh -v /log:/log rosetta:latest bash -x -e /cmd.sh
+        run: |
+          ROSETTA_PATH=$(dirname $(python -c "import rosetta; print(*rosetta.__path__)"))
+          pip install "${ROSETTA_PATH}[test]" pytest-reportlog
+          pytest --report-log=${{ env.TEST_LOG_LOCAL_PATH }} ${ROSETTA_PATH} || true
+
+      - name: Upload unit test json logs
+        uses: actions/upload-artifact@v4
+        with:
+          name: ${{ env.TEST_ARTIFACT_NAME }}
+          path: ${{ env.TEST_LOG_LOCAL_PATH }}
+
+  # Summarises the unit-test report through the reusable badge workflow.
+  # Runs even when unit-tests fails so that a failure status is still emitted.
+  publish-test:
+    needs: unit-tests
+    uses: ./.github/workflows/_publish_badge.yaml
+    if: ( always() )
+    secrets: inherit
+    with:
+      ENDPOINT_FILENAME: "rosetta-unit-test-status.json"
+      PUBLISH: false
+      # SCRIPT counts passed/skipped/failed "call"-phase TestReport entries
+      # in the JSONL report and writes STATUS, LABEL, MESSAGE and COLOR to
+      # $GITHUB_OUTPUT. The job id in the needs expression must match the
+      # `needs:` entry above (unit-tests).
+      # NOTE(review): assumes the reusable workflow places each downloaded
+      # artifact in a directory named after it - confirm in
+      # _publish_badge.yaml.
+      SCRIPT: |
+        ARTIFACTS="${{ needs.unit-tests.outputs.TEST_ARTIFACT_NAME }}/*.jsonl"
+        all_outcomes() {
+          cat $ARTIFACTS | jq -r '. | select((.["$report_type"] == "TestReport") and (.when == "call")) | .outcome'
+        }
+        cnt_type() {
+          cat $ARTIFACTS | jq '. | select((.["$report_type"] == "TestReport") and (.when == "call") and (.outcome | contains("'${1}'"))) | .outcome' | wc -l
+        }
+        SKIPPED_TESTS=$(cnt_type skipped)
+        FAILED_TESTS=$(cnt_type failed)
+        PASSED_TESTS=$(cnt_type passed)
+        TOTAL_TESTS=$(all_outcomes | wc -l)
+        echo "## Unit/Integration test breakdown" | tee -a $GITHUB_STEP_SUMMARY
+        all_outcomes | sort | uniq -c | tee -a $GITHUB_STEP_SUMMARY
+        if [[ $FAILED_TESTS -eq 0 ]] && [[ $TOTAL_TESTS -gt 0 ]]; then
+          BADGE_COLOR=brightgreen
+          echo "STATUS=success" >> $GITHUB_OUTPUT
+        else
+          echo "STATUS=failure" >> $GITHUB_OUTPUT
+          if [[ $PASSED_TESTS -eq 0 ]]; then
+            BADGE_COLOR=red
+          else
+            BADGE_COLOR=yellow
+          fi
+        fi
+        echo "LABEL='V100 Unit'" >> $GITHUB_OUTPUT
+        echo "MESSAGE='${PASSED_TESTS}/${SKIPPED_TESTS}/${FAILED_TESTS} pass/skip/fail'" >> $GITHUB_OUTPUT
+        echo "COLOR='${BADGE_COLOR}'" >> $GITHUB_OUTPUT