From 25ca69e8f19aab0ea954d78dad78fff186bd2eef Mon Sep 17 00:00:00 2001 From: Oliver Koenig Date: Wed, 17 Apr 2024 20:49:22 +0200 Subject: [PATCH 01/11] refactor: `build-maxtext` to `build-upstream-maxtext` --- .github/workflows/_ci.yaml | 25 ++++++++++++------------- README.md | 10 +++++----- 2 files changed, 17 insertions(+), 18 deletions(-) diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 589a42d3b..cd6f5bd8a 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -14,7 +14,7 @@ on: MANIFEST_ARTIFACT_NAME: type: string description: Artifact name in current run w/ manifest/patches. Leaving empty uses manifest/patches in current branch - default: '' + default: "" required: false outputs: DOCKER_TAGS: @@ -22,12 +22,11 @@ on: value: ${{ jobs.collect-docker-tags.outputs.TAGS }} permissions: - contents: read # to fetch code - actions: write # to cancel previous workflows + contents: read # to fetch code + actions: write # to cancel previous workflows packages: write # to upload container jobs: - build-base: uses: ./.github/workflows/_build_base.yaml with: @@ -77,7 +76,7 @@ jobs: DOCKERFILE: .github/container/Dockerfile.equinox secrets: inherit - build-maxtext: + build-upstream-maxtext: needs: build-jax if: inputs.ARCHITECTURE == 'amd64' # Triton does not seem to support arm64 uses: ./.github/workflows/_build.yaml @@ -162,7 +161,7 @@ jobs: CONTAINER_NAME: grok DOCKERFILE: .github/container/Dockerfile.grok secrets: inherit - + collect-docker-tags: runs-on: ubuntu-22.04 if: "!cancelled()" @@ -171,7 +170,7 @@ jobs: - build-jax - build-triton - build-equinox - - build-maxtext + - build-upstream-maxtext - build-levanter - build-upstream-t5x - build-upstream-pax @@ -190,7 +189,7 @@ jobs: {"flavor": "jax", "stage": "final", "priority": 1000, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}"},\ {"flavor": "triton", "stage": "final", "priority": 900, "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }}"},\ {"flavor": 
"equinox", "stage": "final", "priority": 900, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "maxtext", "stage": "final", "priority": 900, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}"},\ + {"flavor": "maxtext", "stage": "final", "priority": 900, "tag": "${{ needs.build-upstream-maxtext.outputs.DOCKER_TAG_FINAL }}"},\ {"flavor": "levanter", "stage": "final", "priority": 900, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }}"},\ {"flavor": "upstream-t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }}"},\ {"flavor": "upstream-pax", "stage": "final", "priority": 900, "tag": "${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }}"},\ @@ -200,7 +199,7 @@ jobs: {"flavor": "jax", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}"},\ {"flavor": "triton", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_MEALKIT }}"},\ {"flavor": "equinox", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "maxtext", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_MEALKIT }}"},\ + {"flavor": "maxtext", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-upstream-maxtext.outputs.DOCKER_TAG_MEALKIT }}"},\ {"flavor": "levanter", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_MEALKIT }}"},\ {"flavor": "upstream-t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\ {"flavor": "upstream-pax", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-upstream-pax.outputs.DOCKER_TAG_MEALKIT }}"},\ @@ -369,7 +368,7 @@ jobs: test-levanter: needs: build-levanter - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a uses: 
./.github/workflows/_test_unit.yaml with: TEST_NAME: levanter @@ -396,7 +395,7 @@ jobs: test-te: needs: build-upstream-pax - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a uses: ./.github/workflows/_test_unit.yaml with: TEST_NAME: te @@ -439,9 +438,9 @@ jobs: secrets: inherit test-maxtext: - needs: build-maxtext + needs: build-upstream-maxtext if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 uses: ./.github/workflows/_test_maxtext.yaml with: - MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }} + MAXTEXT_IMAGE: ${{ needs.build-upstream-maxtext.outputs.DOCKER_TAG_FINAL }} secrets: inherit diff --git a/README.md b/README.md index e8d598626..5108515c3 100644 --- a/README.md +++ b/README.md @@ -215,19 +215,19 @@ - + - ghcr.io/nvidia/jax:maxtext + ghcr.io/nvidia/jax:upstream-maxtext - - + - + From 1da45d29bc79140ac30f616ede56e8f5a081e7a7 Mon Sep 17 00:00:00 2001 From: Oliver Koenig Date: Thu, 18 Apr 2024 14:59:02 +0200 Subject: [PATCH 02/11] chore: Add `.gitignore` --- .gitignore | 1 + 1 file changed, 1 insertion(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 000000000..51b4cfda1 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +local/ From 5acefb49be994b68a668640786a6b4d5d25bfff0 Mon Sep 17 00:00:00 2001 From: Oliver Koenig Date: Thu, 18 Apr 2024 21:19:38 +0200 Subject: [PATCH 03/11] feat: Add rosetta-maxtext Dockerfile --- rosetta/Dockerfile.maxtext | 77 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 rosetta/Dockerfile.maxtext diff --git a/rosetta/Dockerfile.maxtext b/rosetta/Dockerfile.maxtext new file mode 100644 index 000000000..936fbefa1 --- /dev/null +++ b/rosetta/Dockerfile.maxtext @@ -0,0 +1,77 @@ +# syntax=docker/dockerfile:1-labs +ARG BASE_IMAGE=ghcr.io/nvidia/jax-mealkit:upstream-pax +ARG GIT_USER_EMAIL=jax@nvidia.com +ARG GIT_USER_NAME=NVIDIA +# If set to "true", then will pull new 
local patches, the manifest.yaml and create-distribution.sh (in case it was updated). +# This is useful for development if you run `./bump.sh -i manifest.yaml` manually and do not want to trigger a full rebuild all +# the way up to the jax build. +ARG UPDATE_PATCHES=false +# It is common for TE developers to test a different TE against the LLM application. This is a knob to override what's in the manifest +# Accepts git-ref's from NVIDIA/TransformerEngine or pull requests (pull/$number/head) +ARG UPDATED_TE_REF="" + +# Rosetta and optionally patches are pulled from this +FROM scratch AS jax-toolbox + +############################################################################### +### Download source and add auxiliary scripts +################################################################################ + +FROM ${BASE_IMAGE} AS mealkit +ARG GIT_USER_EMAIL +ARG GIT_USER_NAME +ARG UPDATE_PATCHES +ARG UPDATED_TE_REF + +ENV ENABLE_TE=1 + +RUN --mount=target=/mnt/jax-toolbox,from=jax-toolbox <<"EOF" bash -exu +MANIFEST_DIR=$(dirname ${MANIFEST_FILE}) +if [[ "${UPDATE_PATCHES}" != "true" && "${UPDATE_PATCHES}" != "false" ]]; then + echo "UPDATE_PATCHES can only be true or false" + exit 1 +fi +if [[ "${UPDATE_PATCHES}" == "true" ]]; then + cp -r /mnt/jax-toolbox/.github/container/patches ${MANIFEST_DIR}/ + cp /mnt/jax-toolbox/.github/container/manifest.yaml ${MANIFEST_DIR}/manifest.yaml + cp /mnt/jax-toolbox/.github/container/create-distribution.sh ${MANIFEST_DIR}/create-distribution.sh + # TODO: remove + cp /mnt/jax-toolbox/.github/container/pip-finalize.sh /usr/local/bin/ +fi +cp -r /mnt/jax-toolbox/rosetta /opt/rosetta + +if [[ -n "${UPDATED_TE_REF}" ]]; then + TE_INSTALL_DIR=/opt/transformer-engine + yq e ".transformer-engine.latest_verified_commit = \"${UPDATED_TE_REF}\"" -i $MANIFEST_FILE + # Install from source instead of pre-built wheel + sed -i -E 's@( file:///opt/transformer-engine)/dist/[^ ]*@\1@' /opt/pip-tools.d/requirements-te.in + git -C $TE_INSTALL_DIR 
fetch -a + if [[ "${UPDATED_TE_REF}" =~ ^pull/ ]]; then + PR_ID=$(cut -d/ -f2 <<<"${UPDATED_TE_REF}") + git -C $TE_INSTALL_DIR fetch origin ${UPDATED_TE_REF}:PR-${PR_ID} + git -C $TE_INSTALL_DIR checkout PR-${PR_ID} + else + git -C $TE_INSTALL_DIR checkout ${UPDATED_TE_REF} + fi +fi + +# Setting the username/email is required to author commits from patches +git config --global user.email "${GIT_USER_EMAIL}" +git config --global user.name "${GIT_USER_NAME}" + +bash ${MANIFEST_DIR}/create-distribution.sh \ + --manifest ${MANIFEST_FILE} \ + --package maxtext +# Remove .gitconfig to avoid end-user authoring commits as the "build user" +rm -f ~/.gitconfig +EOF + +WORKDIR /opt/rosetta + +############################################################################### +### Install accumulated packages from the base image and the previous stage +################################################################################ + +FROM mealkit as final + +RUN pip-finalize.sh From 4fc41f3bba758962837193fe4289098651bbf534 Mon Sep 17 00:00:00 2001 From: Oliver Koenig Date: Fri, 19 Apr 2024 13:11:37 +0200 Subject: [PATCH 04/11] ci: Add `rosetta-maxtext` build --- .github/workflows/_ci.yaml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index cd6f5bd8a..8c7ef1f4f 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -90,6 +90,16 @@ jobs: DOCKERFILE: .github/container/Dockerfile.maxtext.amd64 secrets: inherit + build-rosetta-maxtext: + needs: build-upstream-maxtext + uses: ./.github/workflows/_build_rosetta.yaml + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-upstream-maxtext.outputs.DOCKER_TAG_MEALKIT }} + BASE_LIBRARY: maxtext + secrets: inherit + build-levanter: needs: [build-jax] uses: ./.github/workflows/_build.yaml From b8699ca47cf12f294e2c899c048867ec90405e7b Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sat, 20 Apr 2024 21:36:23 +0200 Subject: [PATCH 05/11] Update rosetta/Dockerfile.maxtext Co-authored-by: Terry Kong --- .github/workflows/_ci.yaml | 4 +++- rosetta/Dockerfile.maxtext | 4 +--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 8c7ef1f4f..1295f5bc6 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -86,7 +86,7 @@ jobs: BADGE_FILENAME: badge-maxtext-build BUILD_DATE: ${{ inputs.BUILD_DATE }} BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: maxtext + CONTAINER_NAME: upstream-maxtext DOCKERFILE: .github/container/Dockerfile.maxtext.amd64 secrets: inherit @@ -184,8 +184,10 @@ jobs: - build-levanter - build-upstream-t5x - build-upstream-pax + - build-upstream-maxtext - build-rosetta-t5x - build-rosetta-pax + - build-rosetta-maxtext - build-grok outputs: TAGS: ${{ steps.collect-tags.outputs.TAGS }} diff --git a/rosetta/Dockerfile.maxtext b/rosetta/Dockerfile.maxtext index 936fbefa1..6c69ed93a 100644 --- a/rosetta/Dockerfile.maxtext +++ b/rosetta/Dockerfile.maxtext @@ -1,5 +1,5 @@ # syntax=docker/dockerfile:1-labs -ARG BASE_IMAGE=ghcr.io/nvidia/jax-mealkit:upstream-pax +ARG BASE_IMAGE=ghcr.io/nvidia/jax-mealkit:upstream-maxtext ARG GIT_USER_EMAIL=jax@nvidia.com ARG GIT_USER_NAME=NVIDIA # If set to "true", then will pull new local patches, the manifest.yaml and create-distribution.sh (in case it was updated). 
@@ -35,8 +35,6 @@ if [[ "${UPDATE_PATCHES}" == "true" ]]; then cp -r /mnt/jax-toolbox/.github/container/patches ${MANIFEST_DIR}/ cp /mnt/jax-toolbox/.github/container/manifest.yaml ${MANIFEST_DIR}/manifest.yaml cp /mnt/jax-toolbox/.github/container/create-distribution.sh ${MANIFEST_DIR}/create-distribution.sh - # TODO: remove - cp /mnt/jax-toolbox/.github/container/pip-finalize.sh /usr/local/bin/ fi cp -r /mnt/jax-toolbox/rosetta /opt/rosetta From c2525cfbf462658e56bb5ed659ca0f29612d64a5 Mon Sep 17 00:00:00 2001 From: Oliver Koenig Date: Mon, 22 Apr 2024 14:26:57 +0200 Subject: [PATCH 06/11] ci: Publish maxtext upstream & rosetta images --- .github/workflows/_ci.yaml | 6 ++++-- .github/workflows/ci.yaml | 31 +++++++++++++++---------------- 2 files changed, 19 insertions(+), 18 deletions(-) diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 1295f5bc6..522c51474 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -201,20 +201,22 @@ jobs: {"flavor": "jax", "stage": "final", "priority": 1000, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}"},\ {"flavor": "triton", "stage": "final", "priority": 900, "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }}"},\ {"flavor": "equinox", "stage": "final", "priority": 900, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "maxtext", "stage": "final", "priority": 900, "tag": "${{ needs.build-upstream-maxtext.outputs.DOCKER_TAG_FINAL }}"},\ + {"flavor": "upstream-maxtext", "stage": "final", "priority": 900, "tag": "${{ needs.build-upstream-maxtext.outputs.DOCKER_TAG_FINAL }}"},\ {"flavor": "levanter", "stage": "final", "priority": 900, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }}"},\ {"flavor": "upstream-t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }}"},\ {"flavor": "upstream-pax", "stage": "final", "priority": 900, "tag": "${{ 
needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }}"},\ + {"flavor": "maxtext", "stage": "final", "priority": 900, "tag": "${{ needs.build-rosetta-maxtext.outputs.DOCKER_TAG_FINAL }}"},\ {"flavor": "t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }}"},\ {"flavor": "pax", "stage": "final", "priority": 900, "tag": "${{ needs.build-rosetta-pax.outputs.DOCKER_TAG_FINAL }}"},\ {"flavor": "grok", "stage": "final", "priority": 900, "tag": "${{ needs.build-grok.outputs.DOCKER_TAG_FINAL }}"},\ {"flavor": "jax", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}"},\ {"flavor": "triton", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_MEALKIT }}"},\ {"flavor": "equinox", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "maxtext", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-upstream-maxtext.outputs.DOCKER_TAG_MEALKIT }}"},\ + {"flavor": "upstream-maxtext", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-upstream-maxtext.outputs.DOCKER_TAG_MEALKIT }}"},\ {"flavor": "levanter", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_MEALKIT }}"},\ {"flavor": "upstream-t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\ {"flavor": "upstream-pax", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-upstream-pax.outputs.DOCKER_TAG_MEALKIT }}"},\ + {"flavor": "maxtext", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-rosetta-maxtext.outputs.DOCKER_TAG_MEALKIT }}"},\ {"flavor": "t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\ {"flavor": "pax", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-rosetta-pax.outputs.DOCKER_TAG_MEALKIT }}"},\ {"flavor": "grok", 
"stage": "mealkit", "priority": 500, "tag": "${{ needs.build-grok.outputs.DOCKER_TAG_MEALKIT }}"},\ diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 0098b83bf..f93a6535b 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -2,7 +2,7 @@ name: CI on: schedule: - - cron: '30 9 * * *' # Pacific Time 01:30 AM in UTC + - cron: "30 9 * * *" # Pacific Time 01:30 AM in UTC pull_request: types: - opened @@ -10,7 +10,7 @@ on: - ready_for_review - synchronize paths-ignore: - - '**.md' + - "**.md" workflow_dispatch: inputs: PUBLISH: @@ -34,16 +34,15 @@ concurrency: cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} permissions: - contents: write # to fetch code and push branch - actions: write # to cancel previous workflows - packages: write # to upload container - pull-requests: write # to make pull request for manifest bump + contents: write # to fetch code and push branch + actions: write # to cancel previous workflows + packages: write # to upload container + pull-requests: write # to make pull request for manifest bump env: DEFAULT_MANIFEST_ARTIFACT_NAME: bumped-manifest jobs: - metadata: runs-on: ubuntu-22.04 outputs: @@ -115,7 +114,7 @@ jobs: shell: bash -x -e {0} run: | bash bump.sh --input-manifest manifest.yaml --output-manifest manifest.yaml.new --base-patch-dir ./patches-new - + - name: Maybe replace current manifest/patches with the new one and show diff working-directory: .github/container shell: bash -x -e {0} @@ -168,12 +167,11 @@ jobs: steps: - name: "Tests Succeeded: ${{ !contains(needs.*.result, 'failure') && !contains(needs.*.result, 'cancelled') }}" id: test_result - run: - echo "SUCCEEDED=${{ !contains(needs.*.result, 'failure') && !contains(needs.*.result, 'cancelled') }}" | tee -a $GITHUB_OUTPUT + run: echo "SUCCEEDED=${{ !contains(needs.*.result, 'failure') && !contains(needs.*.result, 'cancelled') }}" | tee -a $GITHUB_OUTPUT - name: Check out the repository under ${GITHUB_WORKSPACE} uses: 
actions/checkout@v4 - + - name: Delete checked-out manifest and patches run: | rm .github/container/manifest.yaml @@ -213,7 +211,7 @@ jobs: git merge --ff-only ${{ needs.metadata.outputs.MANIFEST_BRANCH }} # Push the new change git push origin ${{ github.ref_name }} - + # We will create a Draft PR & remote branch if: # 1. The tests failed # 2. The merge failed @@ -244,12 +242,12 @@ jobs: draft: true env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - + - name: "Log created PR: #${{ fromJson(steps.create_pr.outputs.data).number }}" if: steps.create_pr.outcome == 'success' run: | echo "https://github.com/NVIDIA/JAX-Toolbox/pull/${{ fromJson(steps.create_pr.outputs.data).number }}" | tee -a $GITHUB_STEP_SUMMARY - + # Guard delete in simple check to protect other branches - name: Check that the branch matches znightly- prefix run: | @@ -271,7 +269,7 @@ jobs: make-publish-configs: runs-on: ubuntu-22.04 - if: ${{ !cancelled() }} + if: ${{ !cancelled() }} env: MEALKIT_IMAGE_REPO: ${{ needs.metadata.outputs.PUBLISH == 'true' && 'jax-mealkit' || 'mock-jax-mealkit' }} FINAL_IMAGE_REPO: ${{ needs.metadata.outputs.PUBLISH == 'true' && 'jax' || 'mock-jax' }} @@ -294,6 +292,7 @@ jobs: levanter upstream-t5x upstream-pax + upstream-maxtext t5x pax grok @@ -365,7 +364,7 @@ jobs: needs: - metadata - make-publish-configs - if: ${{ !cancelled() && needs.make-publish-configs.outputs.PUBLISH_CONFIGS.config != '{"config":[]}' }} + if: ${{ !cancelled() && needs.make-publish-configs.outputs.PUBLISH_CONFIGS.config != '{"config":[]}' }} strategy: fail-fast: false matrix: ${{ fromJson(needs.make-publish-configs.outputs.PUBLISH_CONFIGS) }} From 2e27a1796390e7c2cb4bee97d9a1e421adae98c6 Mon Sep 17 00:00:00 2001 From: Oliver Koenig Date: Mon, 22 Apr 2024 15:17:38 +0200 Subject: [PATCH 07/11] ci: Refactor maxtext tests To comply with naming convention --- .github/workflows/_ci.yaml | 4 ++-- .../{_test_maxtext.yaml => _test_upstream_maxtext.yaml} | 0 2 files changed, 2 insertions(+), 2 deletions(-)
rename .github/workflows/{_test_maxtext.yaml => _test_upstream_maxtext.yaml} (100%) diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 522c51474..c28f944c6 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -451,10 +451,10 @@ jobs: PAX_IMAGE: ${{ needs.build-rosetta-pax.outputs.DOCKER_TAG_FINAL }} secrets: inherit - test-maxtext: + test-upstream-maxtext: needs: build-upstream-maxtext if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 - uses: ./.github/workflows/_test_maxtext.yaml + uses: ./.github/workflows/_test_upstream_maxtext.yaml with: MAXTEXT_IMAGE: ${{ needs.build-upstream-maxtext.outputs.DOCKER_TAG_FINAL }} secrets: inherit diff --git a/.github/workflows/_test_maxtext.yaml b/.github/workflows/_test_upstream_maxtext.yaml similarity index 100% rename from .github/workflows/_test_maxtext.yaml rename to .github/workflows/_test_upstream_maxtext.yaml From ecc56287e9ad9cb997052acd698027ce4436dc7a Mon Sep 17 00:00:00 2001 From: Oliver Koenig Date: Mon, 22 Apr 2024 21:17:18 +0200 Subject: [PATCH 08/11] docs: Update README --- README.md | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 5108515c3..fb4ae772d 100644 --- a/README.md +++ b/README.md @@ -222,12 +222,31 @@ ghcr.io/nvidia/jax:upstream-maxtext - + + - + + + + + + + + + + + + ghcr.io/nvidia/jax:maxtext + + + + + + + + From 2998718e604afb14c345026ccd93140d8814475a Mon Sep 17 00:00:00 2001 From: Oliver Koenig Date: Mon, 22 Apr 2024 21:18:09 +0200 Subject: [PATCH 09/11] ci: Fix name of badge for upstream-maxtext --- .github/workflows/_test_upstream_maxtext.yaml | 25 +++++++++---------- README.md | 6 ++--- 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/.github/workflows/_test_upstream_maxtext.yaml b/.github/workflows/_test_upstream_maxtext.yaml index 61bee91d6..77589d479 100644 --- a/.github/workflows/_test_upstream_maxtext.yaml +++ 
b/.github/workflows/_test_upstream_maxtext.yaml @@ -11,13 +11,13 @@ on: EXTRA_TEST_ARGS: type: string description: Extra command line args to pass to test-maxtext.sh - default: "" + default: '' required: false BADGE_FILENAME: type: string description: 'Name of the endpoint JSON file for shields.io badge' required: false - default: 'badge-maxtext-test.json' + default: 'badge-upstream-maxtext-test.json' ARTIFACT_NAME: type: string description: 'Name of the artifact zip file' @@ -34,12 +34,11 @@ on: value: ${{ jobs.sitrep.outputs.STATUS }} jobs: - single-process-multi-device: strategy: matrix: PARALLEL_CONFIG: - - [1, 1, 2, 4] + - [1, 1, 2, 4] # - [1, 1, 1, 8] # PP, DP, FSDP, TP fail-fast: false @@ -183,12 +182,12 @@ jobs: strategy: matrix: PARALLEL_CONFIG: - - [1, 1, 1, 1] - - [1, 1, 8, 1] - - [1, 1, 1, 8] - - [1, 1, 4, 2] - - [1, 2, 2, 2] - - [1, 4, 2, 2] + - [1, 1, 1, 1] + - [1, 1, 8, 1] + - [1, 1, 1, 8] + - [1, 1, 4, 2] + - [1, 2, 2, 2] + - [1, 4, 2, 2] fail-fast: false runs-on: ubuntu-22.04 @@ -366,7 +365,7 @@ jobs: sitrep: needs: [single-process-multi-device, maxtext-multinode, metrics] - if: "!cancelled()" + if: '!cancelled()' uses: ./.github/workflows/_sitrep_mgmn.yaml secrets: inherit with: @@ -377,7 +376,7 @@ jobs: summary: runs-on: ubuntu-22.04 needs: [single-process-multi-device, maxtext-multinode] - if: "!cancelled()" + if: '!cancelled()' steps: - name: Generate TensorBoard query URL run: | @@ -394,7 +393,7 @@ jobs: outcome: needs: sitrep runs-on: ubuntu-22.04 - if: "!cancelled()" + if: '!cancelled()' steps: - name: Sets workflow status based on test outputs run: | diff --git a/README.md b/README.md index fb4ae772d..fe7a960ca 100644 --- a/README.md +++ b/README.md @@ -222,7 +222,7 @@ ghcr.io/nvidia/jax:upstream-maxtext - + @@ -241,8 +241,8 @@ ghcr.io/nvidia/jax:maxtext - - + + From a4564f7a9c037d6f6e2c6d8b16462b601b8bded9 Mon Sep 17 00:00:00 2001 From: Oliver Koenig Date: Fri, 26 Apr 2024 15:26:50 +0200 Subject: [PATCH 10/11] test --- 
.github/container/manifest.yaml | 3 + .github/workflows/_ci.yaml | 408 ++++++++++++++++---------------- 2 files changed, 207 insertions(+), 204 deletions(-) diff --git a/.github/container/manifest.yaml b/.github/container/manifest.yaml index 712b13b55..0aede0826 100644 --- a/.github/container/manifest.yaml +++ b/.github/container/manifest.yaml @@ -113,9 +113,12 @@ jax-triton: mode: git-clone maxtext: url: https://github.com/google/maxtext.git + mirror_url: https://github.com/nvjax-svc-0/maxtext.git tracking_ref: main latest_verified_commit: 78daad198544def8274dbd656d122fbe6a0e1129 mode: git-clone + patches: + mirror/patch/test_rosetta_maxtext: file://patches/maxtext/mirror-patch-rosetta-maxtext.patch levanter: url: https://github.com/stanford-crfm/levanter.git tracking_ref: main diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index c28f944c6..203a58163 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -14,7 +14,7 @@ on: MANIFEST_ARTIFACT_NAME: type: string description: Artifact name in current run w/ manifest/patches. 
Leaving empty uses manifest/patches in current branch - default: "" + default: '' required: false outputs: DOCKER_TAGS: @@ -105,8 +105,8 @@ jobs: uses: ./.github/workflows/_build.yaml with: ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: "artifact-levanter-build" - BADGE_FILENAME: "badge-levanter-build" + ARTIFACT_NAME: 'artifact-levanter-build' + BADGE_FILENAME: 'badge-levanter-build' BUILD_DATE: ${{ inputs.BUILD_DATE }} BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} CONTAINER_NAME: levanter @@ -118,8 +118,8 @@ jobs: uses: ./.github/workflows/_build.yaml with: ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: "artifact-t5x-build" - BADGE_FILENAME: "badge-t5x-build" + ARTIFACT_NAME: 'artifact-t5x-build' + BADGE_FILENAME: 'badge-t5x-build' BUILD_DATE: ${{ inputs.BUILD_DATE }} BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} CONTAINER_NAME: upstream-t5x @@ -164,8 +164,8 @@ jobs: uses: ./.github/workflows/_build.yaml with: ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: "artifact-grok-build" - BADGE_FILENAME: "badge-grok-build" + ARTIFACT_NAME: 'artifact-grok-build' + BADGE_FILENAME: 'badge-grok-build' BUILD_DATE: ${{ inputs.BUILD_DATE }} BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} CONTAINER_NAME: grok @@ -174,7 +174,7 @@ jobs: collect-docker-tags: runs-on: ubuntu-22.04 - if: "!cancelled()" + if: '!cancelled()' needs: - build-base - build-jax @@ -227,58 +227,58 @@ jobs: echo "TAGS=${TAGS}" >> $GITHUB_OUTPUT - test-distribution: - runs-on: ubuntu-22.04 - strategy: - matrix: - TEST_SCRIPT: - - extra-only-distribution.sh - - mirror-only-distribution.sh - - upstream-only-distribution.sh - - local-patch-distribution.sh - fail-fast: false - steps: - - name: Print environment variables - run: env - - name: Set git login for tests - run: | - git config --global user.email "jax@nvidia.com" - git config --global user.name "JAX-Toolbox CI" - - name: Check out the repository under ${GITHUB_WORKSPACE} - 
uses: actions/checkout@v4 - - name: Run integration test ${{ matrix.TEST_SCRIPT }} - run: bash rosetta/tests/${{ matrix.TEST_SCRIPT }} + # test-distribution: + # runs-on: ubuntu-22.04 + # strategy: + # matrix: + # TEST_SCRIPT: + # - extra-only-distribution.sh + # - mirror-only-distribution.sh + # - upstream-only-distribution.sh + # - local-patch-distribution.sh + # fail-fast: false + # steps: + # - name: Print environment variables + # run: env + # - name: Set git login for tests + # run: | + # git config --global user.email "jax@nvidia.com" + # git config --global user.name "JAX-Toolbox CI" + # - name: Check out the repository under ${GITHUB_WORKSPACE} + # uses: actions/checkout@v4 + # - name: Run integration test ${{ matrix.TEST_SCRIPT }} + # run: bash rosetta/tests/${{ matrix.TEST_SCRIPT }} - test-jax: - needs: build-jax - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - uses: ./.github/workflows/_test_unit.yaml - with: - TEST_NAME: jax - EXECUTE: | - docker run -i --shm-size=1g --gpus all \ - ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ - bash <<"EOF" |& tee test-backend-independent.log - test-jax.sh -b backend-independent - EOF - docker run -i --shm-size=1g --gpus all \ - ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ - bash <<"EOF" |& tee tee test-gpu.log - test-jax.sh -b gpu - EOF - STATISTICS_SCRIPT: | - errors=$(cat test-*.log | grep -c 'ERROR:' || true) - failed_tests=$(cat test-*.log | grep -c 'FAILED in' || true) - passed_tests=$(cat test-*.log | grep -c 'PASSED in' || true) - total_tests=$((failed_tests + passed_tests)) - echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - ARTIFACTS: | - test-backend-independent.log - test-gpu.log - secrets: inherit + # test-jax: + # needs: build-jax + # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # uses: 
./.github/workflows/_test_unit.yaml + # with: + # TEST_NAME: jax + # EXECUTE: | + # docker run -i --shm-size=1g --gpus all \ + # ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ + # bash <<"EOF" |& tee test-backend-independent.log + # test-jax.sh -b backend-independent + # EOF + # docker run -i --shm-size=1g --gpus all \ + # ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ + # bash <<"EOF" |& tee tee test-gpu.log + # test-jax.sh -b gpu + # EOF + # STATISTICS_SCRIPT: | + # errors=$(cat test-*.log | grep -c 'ERROR:' || true) + # failed_tests=$(cat test-*.log | grep -c 'FAILED in' || true) + # passed_tests=$(cat test-*.log | grep -c 'PASSED in' || true) + # total_tests=$((failed_tests + passed_tests)) + # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + # ARTIFACTS: | + # test-backend-independent.log + # test-gpu.log + # secrets: inherit # test-equinox: # needs: build-equinox @@ -304,157 +304,157 @@ jobs: # test-equinox.log # secrets: inherit - test-te-multigpu: - needs: build-upstream-pax - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - uses: ./.github/workflows/_test_te.yaml - with: - TE_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} - secrets: inherit + # test-te-multigpu: + # needs: build-upstream-pax + # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # uses: ./.github/workflows/_test_te.yaml + # with: + # TE_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} + # secrets: inherit - test-upstream-t5x: - needs: build-upstream-t5x - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - uses: ./.github/workflows/_test_upstream_t5x.yaml - with: - T5X_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }} - secrets: inherit + # test-upstream-t5x: + # needs: build-upstream-t5x + # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # uses: 
./.github/workflows/_test_upstream_t5x.yaml + # with: + # T5X_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }} + # secrets: inherit - test-rosetta-t5x: - needs: build-rosetta-t5x - if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 - uses: ./.github/workflows/_test_t5x_rosetta.yaml - with: - T5X_IMAGE: ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }} - secrets: inherit + # test-rosetta-t5x: + # needs: build-rosetta-t5x + # if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 + # uses: ./.github/workflows/_test_t5x_rosetta.yaml + # with: + # T5X_IMAGE: ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }} + # secrets: inherit - test-pallas: - needs: build-jax - if: inputs.ARCHITECTURE == 'amd64' # triton doesn't support arm64(?) - uses: ./.github/workflows/_test_unit.yaml - with: - TEST_NAME: pallas - EXECUTE: | - docker run -i --shm-size=1g --gpus all --volume $PWD:/output \ - ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ - bash <<"EOF" |& tee test-pallas.log - python /opt/jax/tests/pallas/pallas_test.py --xml_output_file /output/pallas_test.xml - EOF - STATISTICS_SCRIPT: | - curl -L -o yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_$(dpkg --print-architecture) && chmod 777 yq; - total_tests=$(./yq '.testsuites."+@tests"' pallas_test.xml) - errors=$(./yq '.testsuites."+@errors"' pallas_test.xml) - failed_tests=$(./yq '.testsuites."+@failures"' pallas_test.xml) - passed_tests=$((total_tests - errors - failed_tests)) - echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - ARTIFACTS: | - test-pallas.log - secrets: inherit + # test-pallas: + # needs: build-jax + # if: inputs.ARCHITECTURE == 'amd64' # triton doesn't support arm64(?) 
+ # uses: ./.github/workflows/_test_unit.yaml + # with: + # TEST_NAME: pallas + # EXECUTE: | + # docker run -i --shm-size=1g --gpus all --volume $PWD:/output \ + # ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ + # bash <<"EOF" |& tee test-pallas.log + # python /opt/jax/tests/pallas/pallas_test.py --xml_output_file /output/pallas_test.xml + # EOF + # STATISTICS_SCRIPT: | + # curl -L -o yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_$(dpkg --print-architecture) && chmod 777 yq; + # total_tests=$(./yq '.testsuites."+@tests"' pallas_test.xml) + # errors=$(./yq '.testsuites."+@errors"' pallas_test.xml) + # failed_tests=$(./yq '.testsuites."+@failures"' pallas_test.xml) + # passed_tests=$((total_tests - errors - failed_tests)) + # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + # ARTIFACTS: | + # test-pallas.log + # secrets: inherit - test-triton: - needs: build-triton - if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 - uses: ./.github/workflows/_test_unit.yaml - with: - TEST_NAME: triton - EXECUTE: | - docker run -i --shm-size=1g --gpus all --volume $PWD:/output \ - ${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }} \ - bash <<"EOF" |& tee test-triton.log - python /opt/jax-triton/tests/triton_call_test.py --xml_output_file /output/triton_test.xml - EOF - STATISTICS_SCRIPT: | - curl -L -o yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_$(dpkg --print-architecture) && chmod 777 yq; - total_tests=$(./yq '.testsuites."+@tests"' triton_test.xml) - errors=$(./yq '.testsuites."+@errors"' triton_test.xml) - failed_tests=$(./yq '.testsuites."+@failures"' triton_test.xml) - passed_tests=$((total_tests - errors - failed_tests)) - echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - echo "PASSED_TESTS=${passed_tests}" 
>> $GITHUB_OUTPUT - echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - ARTIFACTS: | - test-triton.log - secrets: inherit + # test-triton: + # needs: build-triton + # if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 + # uses: ./.github/workflows/_test_unit.yaml + # with: + # TEST_NAME: triton + # EXECUTE: | + # docker run -i --shm-size=1g --gpus all --volume $PWD:/output \ + # ${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }} \ + # bash <<"EOF" |& tee test-triton.log + # python /opt/jax-triton/tests/triton_call_test.py --xml_output_file /output/triton_test.xml + # EOF + # STATISTICS_SCRIPT: | + # curl -L -o yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_$(dpkg --print-architecture) && chmod 777 yq; + # total_tests=$(./yq '.testsuites."+@tests"' triton_test.xml) + # errors=$(./yq '.testsuites."+@errors"' triton_test.xml) + # failed_tests=$(./yq '.testsuites."+@failures"' triton_test.xml) + # passed_tests=$((total_tests - errors - failed_tests)) + # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + # ARTIFACTS: | + # test-triton.log + # secrets: inherit - test-levanter: - needs: build-levanter - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - uses: ./.github/workflows/_test_unit.yaml - with: - TEST_NAME: levanter - EXECUTE: | - docker run -i --gpus all --shm-size=1g \ - ${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }} \ - bash <<"EOF" |& tee test-levanter.log - pip install pytest - PYTHONPATH=/opt/levanter/tests:$PYTHONPATH pytest /opt/levanter/tests - EOF - STATISTICS_SCRIPT: | - summary_line=$(tail -n1 test-levanter.log) - errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') - passed_tests=$(echo 
$summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') - total_tests=$((failed_tests + passed_tests)) - echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - ARTIFACTS: | - test-levanter.log - secrets: inherit + # test-levanter: + # needs: build-levanter + # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # uses: ./.github/workflows/_test_unit.yaml + # with: + # TEST_NAME: levanter + # EXECUTE: | + # docker run -i --gpus all --shm-size=1g \ + # ${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }} \ + # bash <<"EOF" |& tee test-levanter.log + # pip install pytest + # PYTHONPATH=/opt/levanter/tests:$PYTHONPATH pytest /opt/levanter/tests + # EOF + # STATISTICS_SCRIPT: | + # summary_line=$(tail -n1 test-levanter.log) + # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + # failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') + # passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') + # total_tests=$((failed_tests + passed_tests)) + # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + # ARTIFACTS: | + # test-levanter.log + # secrets: inherit - test-te: - needs: build-upstream-pax - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - uses: ./.github/workflows/_test_unit.yaml - with: - TEST_NAME: te - EXECUTE: | - docker run -i --gpus all --shm-size=1g -v $PWD:/log \ - ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} \ - bash <<"EOF" |& tee test-te.log - pip install pytest-reportlog - pytest --report-log=log/pytest-report.jsonl ${SRC_PATH_TE}/tests/jax - 
EOF - STATISTICS_SCRIPT: | - summary_line=$(tail -n1 test-te.log) - errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "CollectReport" and .outcome == "passed") | .outcome' | wc -l) - failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "CollectReport" and .outcome == "failed") | .outcome' | wc -l) - total_tests=$((failed_tests + passed_tests)) - echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - ARTIFACTS: | - test-te.log - pytest-report.jsonl - secrets: inherit + # test-te: + # needs: build-upstream-pax + # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # uses: ./.github/workflows/_test_unit.yaml + # with: + # TEST_NAME: te + # EXECUTE: | + # docker run -i --gpus all --shm-size=1g -v $PWD:/log \ + # ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} \ + # bash <<"EOF" |& tee test-te.log + # pip install pytest-reportlog + # pytest --report-log=log/pytest-report.jsonl ${SRC_PATH_TE}/tests/jax + # EOF + # STATISTICS_SCRIPT: | + # summary_line=$(tail -n1 test-te.log) + # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + # passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "CollectReport" and .outcome == "passed") | .outcome' | wc -l) + # failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "CollectReport" and .outcome == "failed") | .outcome' | wc -l) + # total_tests=$((failed_tests + passed_tests)) + # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + # ARTIFACTS: | + # test-te.log + # 
pytest-report.jsonl + # secrets: inherit - test-upstream-pax: - needs: build-upstream-pax - if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 - uses: ./.github/workflows/_test_upstream_pax.yaml - with: - PAX_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} - secrets: inherit + # test-upstream-pax: + # needs: build-upstream-pax + # if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 + # uses: ./.github/workflows/_test_upstream_pax.yaml + # with: + # PAX_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} + # secrets: inherit - test-rosetta-pax: - needs: build-rosetta-pax - if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 - uses: ./.github/workflows/_test_pax_rosetta.yaml - with: - PAX_IMAGE: ${{ needs.build-rosetta-pax.outputs.DOCKER_TAG_FINAL }} - secrets: inherit + # test-rosetta-pax: + # needs: build-rosetta-pax + # if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 + # uses: ./.github/workflows/_test_pax_rosetta.yaml + # with: + # PAX_IMAGE: ${{ needs.build-rosetta-pax.outputs.DOCKER_TAG_FINAL }} + # secrets: inherit - test-upstream-maxtext: - needs: build-upstream-maxtext - if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 - uses: ./.github/workflows/_test_upstream_maxtext.yaml - with: - MAXTEXT_IMAGE: ${{ needs.build-upstream-maxtext.outputs.DOCKER_TAG_FINAL }} - secrets: inherit + # test-upstream-maxtext: + # needs: build-upstream-maxtext + # if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 + # uses: ./.github/workflows/_test_upstream_maxtext.yaml + # with: + # MAXTEXT_IMAGE: ${{ needs.build-upstream-maxtext.outputs.DOCKER_TAG_FINAL }} + # secrets: inherit From b876ccfef4005f4216f147cecf76f7bb9d686ffb Mon Sep 17 00:00:00 2001 From: Oliver Koenig Date: Fri, 26 Apr 2024 15:55:01 +0200 Subject: [PATCH 11/11] test --- .github/workflows/ci.yaml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 
f93a6535b..fcaf495a6 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -2,7 +2,7 @@ name: CI on: schedule: - - cron: "30 9 * * *" # Pacific Time 01:30 AM in UTC + - cron: '30 9 * * *' # Pacific Time 01:30 AM in UTC pull_request: types: - opened @@ -10,7 +10,7 @@ on: - ready_for_review - synchronize paths-ignore: - - "**.md" + - '**.md' workflow_dispatch: inputs: PUBLISH: @@ -25,7 +25,7 @@ on: required: false MERGE_BUMPED_MANIFEST: type: boolean - description: "(used if BUMP_MANIFEST=true) If true: attempt to PR/merge manifest branch" + description: '(used if BUMP_MANIFEST=true) If true: attempt to PR/merge manifest branch' default: false required: false @@ -80,7 +80,7 @@ jobs: id: manifest-branch shell: bash -x -e {0} run: | - BUMP_MANIFEST=${{ github.event_name == 'schedule' || inputs.BUMP_MANIFEST || 'false' }} + BUMP_MANIFEST=${{ github.event_name == 'schedule' || inputs.BUMP_MANIFEST || 'true' }} MERGE_BUMPED_MANIFEST=${{ github.event_name == 'schedule' || inputs.MERGE_BUMPED_MANIFEST || 'false' }} # Prepend nightly manifest branch with "z" to make it appear at the end if [[ "$BUMP_MANIFEST" == "true" ]]; then @@ -183,7 +183,7 @@ jobs: name: ${{ needs.metadata.outputs.MANIFEST_ARTIFACT_NAME }} path: .github/container/ - - name: "Create local manifest branch: ${{ needs.metadata.outputs.MANIFEST_BRANCH }}" + - name: 'Create local manifest branch: ${{ needs.metadata.outputs.MANIFEST_BRANCH }}' id: local_branch shell: bash -x -e {0} run: | @@ -243,7 +243,7 @@ jobs: env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: "Log created PR: #${{ fromJson(steps.create_pr.outputs.data).number }}" + - name: 'Log created PR: #${{ fromJson(steps.create_pr.outputs.data).number }}' if: steps.create_pr.outcome == 'success' run: | echo "https://github.com/NVIDIA/JAX-Toolbox/pull/${{ fromJson(steps.create_pr.outputs.data).number }}" | tee -a $GITHUB_STEP_SUMMARY @@ -380,7 +380,7 @@ jobs: finalize: needs: [metadata, amd64, arm64, publish-containers] - if: 
"!cancelled()" + if: '!cancelled()' uses: ./.github/workflows/_finalize.yaml with: BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }}