From 25ca69e8f19aab0ea954d78dad78fff186bd2eef Mon Sep 17 00:00:00 2001
From: Oliver Koenig <okoenig@nvidia.com>
Date: Wed, 17 Apr 2024 20:49:22 +0200
Subject: [PATCH 01/11] refactor: `build-maxtext` to `build-upstream-maxtext`

---
 .github/workflows/_ci.yaml | 25 ++++++++++++-------------
 README.md                  | 10 +++++-----
 2 files changed, 17 insertions(+), 18 deletions(-)

diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml
index 589a42d3b..cd6f5bd8a 100644
--- a/.github/workflows/_ci.yaml
+++ b/.github/workflows/_ci.yaml
@@ -14,7 +14,7 @@ on:
       MANIFEST_ARTIFACT_NAME:
         type: string
         description: Artifact name in current run w/ manifest/patches. Leaving empty uses manifest/patches in current branch
-        default: ''
+        default: ""
         required: false
     outputs:
       DOCKER_TAGS:
@@ -22,12 +22,11 @@ on:
         value: ${{ jobs.collect-docker-tags.outputs.TAGS }}
 
 permissions:
-  contents: read  # to fetch code
-  actions:  write # to cancel previous workflows
+  contents: read # to fetch code
+  actions: write # to cancel previous workflows
   packages: write # to upload container
 
 jobs:
-
   build-base:
     uses: ./.github/workflows/_build_base.yaml
     with:
@@ -77,7 +76,7 @@ jobs:
       DOCKERFILE: .github/container/Dockerfile.equinox
     secrets: inherit
 
-  build-maxtext:
+  build-upstream-maxtext:
     needs: build-jax
     if: inputs.ARCHITECTURE == 'amd64' # Triton does not seem to support arm64
     uses: ./.github/workflows/_build.yaml
@@ -162,7 +161,7 @@ jobs:
       CONTAINER_NAME: grok
       DOCKERFILE: .github/container/Dockerfile.grok
     secrets: inherit
-    
+
   collect-docker-tags:
     runs-on: ubuntu-22.04
     if: "!cancelled()"
@@ -171,7 +170,7 @@ jobs:
       - build-jax
       - build-triton
       - build-equinox
-      - build-maxtext
+      - build-upstream-maxtext
       - build-levanter
       - build-upstream-t5x
       - build-upstream-pax
@@ -190,7 +189,7 @@ jobs:
             {"flavor": "jax",          "stage": "final",   "priority": 1000, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}"},\
             {"flavor": "triton",       "stage": "final",   "priority": 900,  "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }}"},\
             {"flavor": "equinox",      "stage": "final",   "priority": 900,  "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }}"},\
-            {"flavor": "maxtext",      "stage": "final",   "priority": 900,  "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}"},\
+            {"flavor": "maxtext",      "stage": "final",   "priority": 900,  "tag": "${{ needs.build-upstream-maxtext.outputs.DOCKER_TAG_FINAL }}"},\
             {"flavor": "levanter",     "stage": "final",   "priority": 900,  "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }}"},\
             {"flavor": "upstream-t5x", "stage": "final",   "priority": 900,  "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }}"},\
             {"flavor": "upstream-pax", "stage": "final",   "priority": 900,  "tag": "${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }}"},\
@@ -200,7 +199,7 @@ jobs:
             {"flavor": "jax",          "stage": "mealkit", "priority": 500,  "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}"},\
             {"flavor": "triton",       "stage": "mealkit", "priority": 500,  "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_MEALKIT }}"},\
             {"flavor": "equinox",      "stage": "mealkit", "priority": 500,  "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_MEALKIT }}"},\
-            {"flavor": "maxtext",      "stage": "mealkit", "priority": 500,  "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_MEALKIT }}"},\
+            {"flavor": "maxtext",      "stage": "mealkit", "priority": 500,  "tag": "${{ needs.build-upstream-maxtext.outputs.DOCKER_TAG_MEALKIT }}"},\
             {"flavor": "levanter",     "stage": "mealkit", "priority": 500,  "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_MEALKIT }}"},\
             {"flavor": "upstream-t5x", "stage": "mealkit", "priority": 500,  "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\
             {"flavor": "upstream-pax", "stage": "mealkit", "priority": 500,  "tag": "${{ needs.build-upstream-pax.outputs.DOCKER_TAG_MEALKIT }}"},\
@@ -369,7 +368,7 @@ jobs:
 
   test-levanter:
     needs: build-levanter
-    if: inputs.ARCHITECTURE == 'amd64'  # arm64 runners n/a
+    if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a
     uses: ./.github/workflows/_test_unit.yaml
     with:
       TEST_NAME: levanter
@@ -396,7 +395,7 @@ jobs:
 
   test-te:
     needs: build-upstream-pax
-    if: inputs.ARCHITECTURE == 'amd64'  # arm64 runners n/a
+    if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a
     uses: ./.github/workflows/_test_unit.yaml
     with:
       TEST_NAME: te
@@ -439,9 +438,9 @@ jobs:
     secrets: inherit
 
   test-maxtext:
-    needs: build-maxtext
+    needs: build-upstream-maxtext
     if: inputs.ARCHITECTURE == 'amd64' # no images for arm64
     uses: ./.github/workflows/_test_maxtext.yaml
     with:
-      MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}
+      MAXTEXT_IMAGE: ${{ needs.build-upstream-maxtext.outputs.DOCKER_TAG_FINAL }}
     secrets: inherit
diff --git a/README.md b/README.md
index e8d598626..5108515c3 100644
--- a/README.md
+++ b/README.md
@@ -215,19 +215,19 @@
     <tr>
       <td>
         <picture>
-          <img style="height:1em;" src="https://img.shields.io/static/v1?label=&color=gray&logo=docker&message=MaxText%3D%7Bcore%2CMaxText%7D">
+          <img style="height:1em;" src="https://img.shields.io/static/v1?label=&color=gray&logo=docker&message=Upstream%20MaxText%3D%7Bcore%2CMaxText%7D">
         </picture>
       </td>
       <td>
-        <code>ghcr.io/nvidia/jax:maxtext</code>
+        <code>ghcr.io/nvidia/jax:upstream-maxtext</code>
       </td>
       <td>
-        <a href="https://gist.github.com/nvjax/913c2af68649fe568e9711c2dabb23ae/#file-final-maxtext-md"><img style="height:1em;" src="https://img.shields.io/endpoint?url=https%3A%2F%2Fgist.githubusercontent.com%2Fnvjax%2F913c2af68649fe568e9711c2dabb23ae%2Fraw%2Fbadge-maxtext-build-amd64.json&logo=docker&label=amd64"></a>
-        <!-- <img style="height:1em;" src="https://img.shields.io/endpoint?url=https%3A%2F%2Fgist.githubusercontent.com%2Fnvjax%2F913c2af68649fe568e9711c2dabb23ae%2Fraw%2Fbadge-maxtext-build-arm64.json&logo=docker&label=arm64"> -->
+        <!-- <a href="https://gist.github.com/nvjax/913c2af68649fe568e9711c2dabb23ae/#file-final-maxtext-md"><img style="height:1em;" src="https://img.shields.io/endpoint?url=https%3A%2F%2Fgist.githubusercontent.com%2Fnvjax%2F913c2af68649fe568e9711c2dabb23ae%2Fraw%2Fbadge-maxtext-build-amd64.json&logo=docker&label=amd64"></a>
+         <img style="height:1em;" src="https://img.shields.io/endpoint?url=https%3A%2F%2Fgist.githubusercontent.com%2Fnvjax%2F913c2af68649fe568e9711c2dabb23ae%2Fraw%2Fbadge-maxtext-build-arm64.json&logo=docker&label=arm64"> -->
       </td>
       <td>
         <picture>
-          <img style="height:1em;" src="https://img.shields.io/endpoint?url=https%3A%2F%2Fgist.githubusercontent.com%2Fnvjax%2F913c2af68649fe568e9711c2dabb23ae%2Fraw%2Fbadge-maxtext-test.json&logo=nvidia&label=A100%20distributed">
+          <!--<img style="height:1em;" src="https://img.shields.io/endpoint?url=https%3A%2F%2Fgist.githubusercontent.com%2Fnvjax%2F913c2af68649fe568e9711c2dabb23ae%2Fraw%2Fbadge-maxtext-test.json&logo=nvidia&label=A100%20distributed">-->
         </picture>
       </td>
     </tr>

From 1da45d29bc79140ac30f616ede56e8f5a081e7a7 Mon Sep 17 00:00:00 2001
From: Oliver Koenig <okoenig@nvidia.com>
Date: Thu, 18 Apr 2024 14:59:02 +0200
Subject: [PATCH 02/11] chore: Add `.gitignore`

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 .gitignore

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 000000000..51b4cfda1
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+local/

From 5acefb49be994b68a668640786a6b4d5d25bfff0 Mon Sep 17 00:00:00 2001
From: Oliver Koenig <okoenig@nvidia.com>
Date: Thu, 18 Apr 2024 21:19:38 +0200
Subject: [PATCH 03/11] feat: Add rosetta-maxtext Dockerfile

---
 rosetta/Dockerfile.maxtext | 77 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 77 insertions(+)
 create mode 100644 rosetta/Dockerfile.maxtext

diff --git a/rosetta/Dockerfile.maxtext b/rosetta/Dockerfile.maxtext
new file mode 100644
index 000000000..936fbefa1
--- /dev/null
+++ b/rosetta/Dockerfile.maxtext
@@ -0,0 +1,77 @@
+# syntax=docker/dockerfile:1-labs
+ARG BASE_IMAGE=ghcr.io/nvidia/jax-mealkit:upstream-pax
+ARG GIT_USER_EMAIL=jax@nvidia.com
+ARG GIT_USER_NAME=NVIDIA
+# If set to "true", then will pull new local patches, the manifest.yaml and create-distribution.sh (in case it was updated).
+# This is useful for development if you run `./bump.sh -i manifest.yaml` manually and do not want to trigger a full rebuild all
+# the way up to the jax build.
+ARG UPDATE_PATCHES=false
+# It is common for TE developers to test a different TE against the LLM application. This is a knob to override what's in the manifest
+# Accepts git-ref's from NVIDIA/TransformerEngine or pull requests (pull/$number/head)
+ARG UPDATED_TE_REF=""
+
+# Rosetta and optionally patches are pulled from this
+FROM scratch AS jax-toolbox
+
+###############################################################################
+### Download source and add auxiliary scripts
+################################################################################
+
+FROM ${BASE_IMAGE} AS mealkit
+ARG GIT_USER_EMAIL
+ARG GIT_USER_NAME
+ARG UPDATE_PATCHES
+ARG UPDATED_TE_REF
+
+ENV ENABLE_TE=1
+
+RUN --mount=target=/mnt/jax-toolbox,from=jax-toolbox <<"EOF" bash -exu
+MANIFEST_DIR=$(dirname ${MANIFEST_FILE})
+if [[ "${UPDATE_PATCHES}" != "true" && "${UPDATE_PATCHES}" != "false" ]]; then
+  echo "UPDATE_PATCHES can only be true or false"
+  exit 1
+fi
+if [[ "${UPDATE_PATCHES}" == "true" ]]; then
+  cp -r /mnt/jax-toolbox/.github/container/patches ${MANIFEST_DIR}/
+  cp /mnt/jax-toolbox/.github/container/manifest.yaml ${MANIFEST_DIR}/manifest.yaml
+  cp /mnt/jax-toolbox/.github/container/create-distribution.sh ${MANIFEST_DIR}/create-distribution.sh
+  # TODO: remove
+  cp /mnt/jax-toolbox/.github/container/pip-finalize.sh /usr/local/bin/
+fi
+cp -r /mnt/jax-toolbox/rosetta /opt/rosetta
+
+if [[ -n "${UPDATED_TE_REF}" ]]; then
+  TE_INSTALL_DIR=/opt/transformer-engine
+  yq e ".transformer-engine.latest_verified_commit = \"${UPDATED_TE_REF}\"" -i $MANIFEST_FILE
+  # Install from source instead of pre-built wheel
+  sed -i -E 's@( file:///opt/transformer-engine)/dist/[^ ]*@\1@' /opt/pip-tools.d/requirements-te.in
+  git -C $TE_INSTALL_DIR fetch -a
+  if [[ "${UPDATED_TE_REF}" =~ ^pull/ ]]; then
+    PR_ID=$(cut -d/ -f2 <<<"${UPDATED_TE_REF}")
+    git -C $TE_INSTALL_DIR fetch origin ${UPDATED_TE_REF}:PR-${PR_ID}
+    git -C $TE_INSTALL_DIR checkout PR-${PR_ID}
+  else
+    git -C $TE_INSTALL_DIR checkout ${UPDATED_TE_REF}
+  fi
+fi
+
+# Setting the username/email is required to author commits from patches
+git config --global user.email "${GIT_USER_EMAIL}"
+git config --global user.name "${GIT_USER_NAME}"
+
+bash ${MANIFEST_DIR}/create-distribution.sh \
+  --manifest ${MANIFEST_FILE} \
+  --package maxtext
+# Remove .gitconfig to avoid end-user authoring commits as the "build user"
+rm -f ~/.gitconfig
+EOF
+
+WORKDIR /opt/rosetta
+
+###############################################################################
+### Install accumulated packages from the base image and the previous stage
+################################################################################
+
+FROM mealkit as final
+
+RUN pip-finalize.sh

From 4fc41f3bba758962837193fe4289098651bbf534 Mon Sep 17 00:00:00 2001
From: Oliver Koenig <okoenig@nvidia.com>
Date: Fri, 19 Apr 2024 13:11:37 +0200
Subject: [PATCH 04/11] ci: Add `rosetta-maxtext` build

---
 .github/workflows/_ci.yaml | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml
index cd6f5bd8a..8c7ef1f4f 100644
--- a/.github/workflows/_ci.yaml
+++ b/.github/workflows/_ci.yaml
@@ -90,6 +90,16 @@ jobs:
       DOCKERFILE: .github/container/Dockerfile.maxtext.amd64
     secrets: inherit
 
+  build-rosetta-maxtext:
+    needs: build-upstream-maxtext
+    uses: ./.github/workflows/_build_rosetta.yaml
+    with:
+      ARCHITECTURE: ${{ inputs.ARCHITECTURE }}
+      BUILD_DATE: ${{ inputs.BUILD_DATE }}
+      BASE_IMAGE: ${{ needs.build-upstream-maxtext.outputs.DOCKER_TAG_MEALKIT }}
+      BASE_LIBRARY: maxtext
+    secrets: inherit
+
   build-levanter:
     needs: [build-jax]
     uses: ./.github/workflows/_build.yaml

From b8699ca47cf12f294e2c899c048867ec90405e7b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?oliver=20k=C3=B6nig?= <koenig.oliver@icloud.com>
Date: Sat, 20 Apr 2024 21:36:23 +0200
Subject: [PATCH 05/11] Update rosetta/Dockerfile.maxtext

Co-authored-by: Terry Kong <terryk@nvidia.com>
---
 .github/workflows/_ci.yaml | 4 +++-
 rosetta/Dockerfile.maxtext | 4 +---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml
index 8c7ef1f4f..1295f5bc6 100644
--- a/.github/workflows/_ci.yaml
+++ b/.github/workflows/_ci.yaml
@@ -86,7 +86,7 @@ jobs:
       BADGE_FILENAME: badge-maxtext-build
       BUILD_DATE: ${{ inputs.BUILD_DATE }}
       BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}
-      CONTAINER_NAME: maxtext
+      CONTAINER_NAME: upstream-maxtext
       DOCKERFILE: .github/container/Dockerfile.maxtext.amd64
     secrets: inherit
 
@@ -184,8 +184,10 @@ jobs:
       - build-levanter
       - build-upstream-t5x
       - build-upstream-pax
+      - build-upstream-maxtext
       - build-rosetta-t5x
       - build-rosetta-pax
+      - build-rosetta-maxtext
       - build-grok
     outputs:
       TAGS: ${{ steps.collect-tags.outputs.TAGS }}
diff --git a/rosetta/Dockerfile.maxtext b/rosetta/Dockerfile.maxtext
index 936fbefa1..6c69ed93a 100644
--- a/rosetta/Dockerfile.maxtext
+++ b/rosetta/Dockerfile.maxtext
@@ -1,5 +1,5 @@
 # syntax=docker/dockerfile:1-labs
-ARG BASE_IMAGE=ghcr.io/nvidia/jax-mealkit:upstream-pax
+ARG BASE_IMAGE=ghcr.io/nvidia/jax-mealkit:upstream-maxtext
 ARG GIT_USER_EMAIL=jax@nvidia.com
 ARG GIT_USER_NAME=NVIDIA
 # If set to "true", then will pull new local patches, the manifest.yaml and create-distribution.sh (in case it was updated).
@@ -35,8 +35,6 @@ if [[ "${UPDATE_PATCHES}" == "true" ]]; then
   cp -r /mnt/jax-toolbox/.github/container/patches ${MANIFEST_DIR}/
   cp /mnt/jax-toolbox/.github/container/manifest.yaml ${MANIFEST_DIR}/manifest.yaml
   cp /mnt/jax-toolbox/.github/container/create-distribution.sh ${MANIFEST_DIR}/create-distribution.sh
-  # TODO: remove
-  cp /mnt/jax-toolbox/.github/container/pip-finalize.sh /usr/local/bin/
 fi
 cp -r /mnt/jax-toolbox/rosetta /opt/rosetta
 

From c2525cfbf462658e56bb5ed659ca0f29612d64a5 Mon Sep 17 00:00:00 2001
From: Oliver Koenig <okoenig@nvidia.com>
Date: Mon, 22 Apr 2024 14:26:57 +0200
Subject: [PATCH 06/11] ci: Publish maxtext upstream & rosetta images

---
 .github/workflows/_ci.yaml |  6 ++++--
 .github/workflows/ci.yaml  | 31 +++++++++++++++----------------
 2 files changed, 19 insertions(+), 18 deletions(-)

diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml
index 1295f5bc6..522c51474 100644
--- a/.github/workflows/_ci.yaml
+++ b/.github/workflows/_ci.yaml
@@ -201,20 +201,22 @@ jobs:
             {"flavor": "jax",          "stage": "final",   "priority": 1000, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}"},\
             {"flavor": "triton",       "stage": "final",   "priority": 900,  "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }}"},\
             {"flavor": "equinox",      "stage": "final",   "priority": 900,  "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }}"},\
-            {"flavor": "maxtext",      "stage": "final",   "priority": 900,  "tag": "${{ needs.build-upstream-maxtext.outputs.DOCKER_TAG_FINAL }}"},\
+            {"flavor": "upstream-maxtext",      "stage": "final",   "priority": 900,  "tag": "${{ needs.build-upstream-maxtext.outputs.DOCKER_TAG_FINAL }}"},\
             {"flavor": "levanter",     "stage": "final",   "priority": 900,  "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }}"},\
             {"flavor": "upstream-t5x", "stage": "final",   "priority": 900,  "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }}"},\
             {"flavor": "upstream-pax", "stage": "final",   "priority": 900,  "tag": "${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }}"},\
+            {"flavor": "maxtext",          "stage": "final",   "priority": 900,  "tag": "${{ needs.build-rosetta-maxtext.outputs.DOCKER_TAG_FINAL }}"},\
             {"flavor": "t5x",          "stage": "final",   "priority": 900,  "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }}"},\
             {"flavor": "pax",          "stage": "final",   "priority": 900,  "tag": "${{ needs.build-rosetta-pax.outputs.DOCKER_TAG_FINAL }}"},\
             {"flavor": "grok",         "stage": "final",   "priority": 900,  "tag": "${{ needs.build-grok.outputs.DOCKER_TAG_FINAL }}"},\
             {"flavor": "jax",          "stage": "mealkit", "priority": 500,  "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}"},\
             {"flavor": "triton",       "stage": "mealkit", "priority": 500,  "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_MEALKIT }}"},\
             {"flavor": "equinox",      "stage": "mealkit", "priority": 500,  "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_MEALKIT }}"},\
-            {"flavor": "maxtext",      "stage": "mealkit", "priority": 500,  "tag": "${{ needs.build-upstream-maxtext.outputs.DOCKER_TAG_MEALKIT }}"},\
+            {"flavor": "upstream-maxtext",      "stage": "mealkit", "priority": 500,  "tag": "${{ needs.build-upstream-maxtext.outputs.DOCKER_TAG_MEALKIT }}"},\
             {"flavor": "levanter",     "stage": "mealkit", "priority": 500,  "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_MEALKIT }}"},\
             {"flavor": "upstream-t5x", "stage": "mealkit", "priority": 500,  "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\
             {"flavor": "upstream-pax", "stage": "mealkit", "priority": 500,  "tag": "${{ needs.build-upstream-pax.outputs.DOCKER_TAG_MEALKIT }}"},\
+            {"flavor": "maxtext",          "stage": "mealkit", "priority": 500,  "tag": "${{ needs.build-rosetta-maxtext.outputs.DOCKER_TAG_MEALKIT }}"},\
             {"flavor": "t5x",          "stage": "mealkit", "priority": 500,  "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\
             {"flavor": "pax",          "stage": "mealkit", "priority": 500,  "tag": "${{ needs.build-rosetta-pax.outputs.DOCKER_TAG_MEALKIT }}"},\
             {"flavor": "grok",         "stage": "mealkit", "priority": 500,  "tag": "${{ needs.build-grok.outputs.DOCKER_TAG_MEALKIT }}"},\
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 0098b83bf..f93a6535b 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -2,7 +2,7 @@ name: CI
 
 on:
   schedule:
-    - cron: '30 9 * * *'  # Pacific Time 01:30 AM in UTC
+    - cron: "30 9 * * *" # Pacific Time 01:30 AM in UTC
   pull_request:
     types:
       - opened
@@ -10,7 +10,7 @@ on:
       - ready_for_review
       - synchronize
     paths-ignore:
-      - '**.md'
+      - "**.md"
   workflow_dispatch:
     inputs:
       PUBLISH:
@@ -34,16 +34,15 @@ concurrency:
   cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
 
 permissions:
-  contents: write       # to fetch code and push branch
-  actions:  write       # to cancel previous workflows
-  packages: write       # to upload container
-  pull-requests: write  # to make pull request for manifest bump
+  contents: write # to fetch code and push branch
+  actions: write # to cancel previous workflows
+  packages: write # to upload container
+  pull-requests: write # to make pull request for manifest bump
 
 env:
   DEFAULT_MANIFEST_ARTIFACT_NAME: bumped-manifest
 
 jobs:
-
   metadata:
     runs-on: ubuntu-22.04
     outputs:
@@ -115,7 +114,7 @@ jobs:
         shell: bash -x -e {0}
         run: |
           bash bump.sh --input-manifest manifest.yaml --output-manifest manifest.yaml.new --base-patch-dir ./patches-new
-      
+
       - name: Maybe replace current manifest/patches with the new one and show diff
         working-directory: .github/container
         shell: bash -x -e {0}
@@ -168,12 +167,11 @@ jobs:
     steps:
       - name: "Tests Succeeded: ${{ !contains(needs.*.result, 'failure') && !contains(needs.*.result, 'cancelled') }}"
         id: test_result
-        run:
-          echo "SUCCEEDED=${{ !contains(needs.*.result, 'failure') && !contains(needs.*.result, 'cancelled') }}" | tee -a $GITHUB_OUTPUT
+        run: echo "SUCCEEDED=${{ !contains(needs.*.result, 'failure') && !contains(needs.*.result, 'cancelled') }}" | tee -a $GITHUB_OUTPUT
 
       - name: Check out the repository under ${GITHUB_WORKSPACE}
         uses: actions/checkout@v4
-      
+
       - name: Delete checked-out manifest and patches
         run: |
           rm .github/container/manifest.yaml
@@ -213,7 +211,7 @@ jobs:
           git merge --ff-only ${{ needs.metadata.outputs.MANIFEST_BRANCH }}
           # Push the new change
           git push origin ${{ github.ref_name }}
-      
+
       # We will create a Draft PR & remote branch if:
       #  1. The tests failed
       #  2. The merge failed
@@ -244,12 +242,12 @@ jobs:
           draft: true
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-      
+
       - name: "Log created PR: #${{ fromJson(steps.create_pr.outputs.data).number }}"
         if: steps.create_pr.outcome == 'success'
         run: |
           echo "https://github.com/NVIDIA/JAX-Toolbox/pull/${{ fromJson(steps.create_pr.outputs.data).number }}" | tee -a $GITHUB_STEP_SUMMARY
-      
+
       # Guard delete in simple check to protect other branches
       - name: Check that the branch matches znightly- prefix
         run: |
@@ -271,7 +269,7 @@ jobs:
 
   make-publish-configs:
     runs-on: ubuntu-22.04
-    if:  ${{ !cancelled() }}
+    if: ${{ !cancelled() }}
     env:
       MEALKIT_IMAGE_REPO: ${{ needs.metadata.outputs.PUBLISH == 'true' && 'jax-mealkit' || 'mock-jax-mealkit' }}
       FINAL_IMAGE_REPO: ${{ needs.metadata.outputs.PUBLISH == 'true' && 'jax' || 'mock-jax' }}
@@ -294,6 +292,7 @@ jobs:
             levanter
             upstream-t5x
             upstream-pax
+            upstream-maxtext
             t5x
             pax
             grok
@@ -365,7 +364,7 @@ jobs:
     needs:
       - metadata
       - make-publish-configs
-    if:  ${{ !cancelled() && needs.make-publish-configs.outputs.PUBLISH_CONFIGS.config != '{"config":[]}' }}
+    if: ${{ !cancelled() && needs.make-publish-configs.outputs.PUBLISH_CONFIGS.config != '{"config":[]}' }}
     strategy:
       fail-fast: false
       matrix: ${{ fromJson(needs.make-publish-configs.outputs.PUBLISH_CONFIGS) }}

From 2e27a1796390e7c2cb4bee97d9a1e421adae98c6 Mon Sep 17 00:00:00 2001
From: Oliver Koenig <okoenig@nvidia.com>
Date: Mon, 22 Apr 2024 15:17:38 +0200
Subject: [PATCH 07/11] ci: Refactor maxtext tests

To comply with naming convention
---
 .github/workflows/_ci.yaml                                    | 4 ++--
 .../{_test_maxtext.yaml => _test_upstream_maxtext.yaml}       | 0
 2 files changed, 2 insertions(+), 2 deletions(-)
 rename .github/workflows/{_test_maxtext.yaml => _test_upstream_maxtext.yaml} (100%)

diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml
index 522c51474..c28f944c6 100644
--- a/.github/workflows/_ci.yaml
+++ b/.github/workflows/_ci.yaml
@@ -451,10 +451,10 @@ jobs:
       PAX_IMAGE: ${{ needs.build-rosetta-pax.outputs.DOCKER_TAG_FINAL }}
     secrets: inherit
 
-  test-maxtext:
+  test-upstream-maxtext:
     needs: build-upstream-maxtext
     if: inputs.ARCHITECTURE == 'amd64' # no images for arm64
-    uses: ./.github/workflows/_test_maxtext.yaml
+    uses: ./.github/workflows/_test_upstream_maxtext.yaml
     with:
       MAXTEXT_IMAGE: ${{ needs.build-upstream-maxtext.outputs.DOCKER_TAG_FINAL }}
     secrets: inherit
diff --git a/.github/workflows/_test_maxtext.yaml b/.github/workflows/_test_upstream_maxtext.yaml
similarity index 100%
rename from .github/workflows/_test_maxtext.yaml
rename to .github/workflows/_test_upstream_maxtext.yaml

From ecc56287e9ad9cb997052acd698027ce4436dc7a Mon Sep 17 00:00:00 2001
From: Oliver Koenig <okoenig@nvidia.com>
Date: Mon, 22 Apr 2024 21:17:18 +0200
Subject: [PATCH 08/11] docs: Update README

---
 README.md | 25 ++++++++++++++++++++++---
 1 file changed, 22 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 5108515c3..fb4ae772d 100644
--- a/README.md
+++ b/README.md
@@ -222,12 +222,31 @@
         <code>ghcr.io/nvidia/jax:upstream-maxtext</code>
       </td>
       <td>
-        <!-- <a href="https://gist.github.com/nvjax/913c2af68649fe568e9711c2dabb23ae/#file-final-maxtext-md"><img style="height:1em;" src="https://img.shields.io/endpoint?url=https%3A%2F%2Fgist.githubusercontent.com%2Fnvjax%2F913c2af68649fe568e9711c2dabb23ae%2Fraw%2Fbadge-maxtext-build-amd64.json&logo=docker&label=amd64"></a>
-         <img style="height:1em;" src="https://img.shields.io/endpoint?url=https%3A%2F%2Fgist.githubusercontent.com%2Fnvjax%2F913c2af68649fe568e9711c2dabb23ae%2Fraw%2Fbadge-maxtext-build-arm64.json&logo=docker&label=arm64"> -->
+        <a href="https://gist.github.com/nvjax/913c2af68649fe568e9711c2dabb23ae/#file-final-maxtext-md"><img style="height:1em;" src="https://img.shields.io/endpoint?url=https%3A%2F%2Fgist.githubusercontent.com%2Fnvjax%2F913c2af68649fe568e9711c2dabb23ae%2Fraw%2Fbadge-upstream-maxtext-build-amd64.json&logo=docker&label=amd64"></a>
+         <img style="height:1em;" src="https://img.shields.io/endpoint?url=https%3A%2F%2Fgist.githubusercontent.com%2Fnvjax%2F913c2af68649fe568e9711c2dabb23ae%2Fraw%2Fbadge-upstream-maxtext-build-arm64.json&logo=docker&label=arm64">
       </td>
       <td>
         <picture>
-          <!--<img style="height:1em;" src="https://img.shields.io/endpoint?url=https%3A%2F%2Fgist.githubusercontent.com%2Fnvjax%2F913c2af68649fe568e9711c2dabb23ae%2Fraw%2Fbadge-maxtext-test.json&logo=nvidia&label=A100%20distributed">-->
+          <img style="height:1em;" src="https://img.shields.io/endpoint?url=https%3A%2F%2Fgist.githubusercontent.com%2Fnvjax%2F913c2af68649fe568e9711c2dabb23ae%2Fraw%2Fbadge-upstream-maxtext-test.json&logo=nvidia&label=A100%20distributed">
+        </picture>
+      </td>
+    </tr>
+    <tr>
+      <td>
+        <picture>
+          <img style="height:1em;" src="https://img.shields.io/static/v1?label=&color=gray&logo=docker&message=Rosetta%20MaxText%3D%7Bcore%2CMaxText%7D">
+        </picture>
+      </td>
+      <td>
+        <code>ghcr.io/nvidia/jax:maxtext</code>
+      </td>
+      <td>
+        <a href="https://gist.github.com/nvjax/913c2af68649fe568e9711c2dabb23ae/#file-final-maxtext-md"><img style="height:1em;" src="https://img.shields.io/endpoint?url=https%3A%2F%2Fgist.githubusercontent.com%2Fnvjax%2F913c2af68649fe568e9711c2dabb23ae%2Fraw%2Fbadge-rosetta-maxtext-build-amd64.json&logo=docker&label=amd64"></a>
+         <img style="height:1em;" src="https://img.shields.io/endpoint?url=https%3A%2F%2Fgist.githubusercontent.com%2Fnvjax%2F913c2af68649fe568e9711c2dabb23ae%2Fraw%2Fbadge-rosetta-maxtext-build-arm64.json&logo=docker&label=arm64">
+      </td>
+      <td>
+        <picture>
+          <img style="height:1em;" src="https://img.shields.io/endpoint?url=https%3A%2F%2Fgist.githubusercontent.com%2Fnvjax%2F913c2af68649fe568e9711c2dabb23ae%2Fraw%2Fbadge-rosetta-maxtext-test.json&logo=nvidia&label=A100%20distributed">
         </picture>
       </td>
     </tr>

From 2998718e604afb14c345026ccd93140d8814475a Mon Sep 17 00:00:00 2001
From: Oliver Koenig <okoenig@nvidia.com>
Date: Mon, 22 Apr 2024 21:18:09 +0200
Subject: [PATCH 09/11] ci: Fix name of badge for upstream-maxtext

---
 .github/workflows/_test_upstream_maxtext.yaml | 25 +++++++++----------
 README.md                                     |  6 ++---
 2 files changed, 15 insertions(+), 16 deletions(-)

diff --git a/.github/workflows/_test_upstream_maxtext.yaml b/.github/workflows/_test_upstream_maxtext.yaml
index 61bee91d6..77589d479 100644
--- a/.github/workflows/_test_upstream_maxtext.yaml
+++ b/.github/workflows/_test_upstream_maxtext.yaml
@@ -11,13 +11,13 @@ on:
       EXTRA_TEST_ARGS:
         type: string
         description: Extra command line args to pass to test-maxtext.sh
-        default: ""
+        default: ''
         required: false
       BADGE_FILENAME:
         type: string
         description: 'Name of the endpoint JSON file for shields.io badge'
         required: false
-        default: 'badge-maxtext-test.json'
+        default: 'badge-upstream-maxtext-test.json'
       ARTIFACT_NAME:
         type: string
         description: 'Name of the artifact zip file'
@@ -34,12 +34,11 @@ on:
         value: ${{ jobs.sitrep.outputs.STATUS }}
 
 jobs:
-
   single-process-multi-device:
     strategy:
       matrix:
         PARALLEL_CONFIG:
-        - [1, 1, 2, 4]
+          - [1, 1, 2, 4]
         # - [1, 1, 1, 8] # PP, DP, FSDP, TP
       fail-fast: false
 
@@ -183,12 +182,12 @@ jobs:
     strategy:
       matrix:
         PARALLEL_CONFIG:
-        - [1, 1, 1, 1]
-        - [1, 1, 8, 1]
-        - [1, 1, 1, 8]
-        - [1, 1, 4, 2]
-        - [1, 2, 2, 2]
-        - [1, 4, 2, 2]
+          - [1, 1, 1, 1]
+          - [1, 1, 8, 1]
+          - [1, 1, 1, 8]
+          - [1, 1, 4, 2]
+          - [1, 2, 2, 2]
+          - [1, 4, 2, 2]
       fail-fast: false
 
     runs-on: ubuntu-22.04
@@ -366,7 +365,7 @@ jobs:
 
   sitrep:
     needs: [single-process-multi-device, maxtext-multinode, metrics]
-    if: "!cancelled()"
+    if: '!cancelled()'
     uses: ./.github/workflows/_sitrep_mgmn.yaml
     secrets: inherit
     with:
@@ -377,7 +376,7 @@ jobs:
   summary:
     runs-on: ubuntu-22.04
     needs: [single-process-multi-device, maxtext-multinode]
-    if: "!cancelled()"
+    if: '!cancelled()'
     steps:
       - name: Generate TensorBoard query URL
         run: |
@@ -394,7 +393,7 @@ jobs:
   outcome:
     needs: sitrep
     runs-on: ubuntu-22.04
-    if: "!cancelled()"
+    if: '!cancelled()'
     steps:
       - name: Sets workflow status based on test outputs
         run: |
diff --git a/README.md b/README.md
index fb4ae772d..fe7a960ca 100644
--- a/README.md
+++ b/README.md
@@ -222,7 +222,7 @@
         <code>ghcr.io/nvidia/jax:upstream-maxtext</code>
       </td>
       <td>
-        <a href="https://gist.github.com/nvjax/913c2af68649fe568e9711c2dabb23ae/#file-final-maxtext-md"><img style="height:1em;" src="https://img.shields.io/endpoint?url=https%3A%2F%2Fgist.githubusercontent.com%2Fnvjax%2F913c2af68649fe568e9711c2dabb23ae%2Fraw%2Fbadge-upstream-maxtext-build-amd64.json&logo=docker&label=amd64"></a>
+        <a href="https://gist.github.com/nvjax/913c2af68649fe568e9711c2dabb23ae/#file-final-upstream-maxtext-md"><img style="height:1em;" src="https://img.shields.io/endpoint?url=https%3A%2F%2Fgist.githubusercontent.com%2Fnvjax%2F913c2af68649fe568e9711c2dabb23ae%2Fraw%2Fbadge-upstream-maxtext-build-amd64.json&logo=docker&label=amd64"></a>
          <img style="height:1em;" src="https://img.shields.io/endpoint?url=https%3A%2F%2Fgist.githubusercontent.com%2Fnvjax%2F913c2af68649fe568e9711c2dabb23ae%2Fraw%2Fbadge-upstream-maxtext-build-arm64.json&logo=docker&label=arm64">
       </td>
       <td>
@@ -241,8 +241,8 @@
         <code>ghcr.io/nvidia/jax:maxtext</code>
       </td>
       <td>
-        <a href="https://gist.github.com/nvjax/913c2af68649fe568e9711c2dabb23ae/#file-final-maxtext-md"><img style="height:1em;" src="https://img.shields.io/endpoint?url=https%3A%2F%2Fgist.githubusercontent.com%2Fnvjax%2F913c2af68649fe568e9711c2dabb23ae%2Fraw%2Fbadge-rosetta-maxtext-build-amd64.json&logo=docker&label=amd64"></a>
-         <img style="height:1em;" src="https://img.shields.io/endpoint?url=https%3A%2F%2Fgist.githubusercontent.com%2Fnvjax%2F913c2af68649fe568e9711c2dabb23ae%2Fraw%2Fbadge-rosetta-maxtext-build-arm64.json&logo=docker&label=arm64">
+        <a href="https://gist.github.com/nvjax/913c2af68649fe568e9711c2dabb23ae/#file-final-md"><img style="height:1em;" src="https://img.shields.io/endpoint?url=https%3A%2F%2Fgist.githubusercontent.com%2Fnvjax%2F913c2af68649fe568e9711c2dabb23ae%2Fraw%2Fbadge-rosetta-build-amd64.json&logo=docker&label=amd64"></a>
+         <img style="height:1em;" src="https://img.shields.io/endpoint?url=https%3A%2F%2Fgist.githubusercontent.com%2Fnvjax%2F913c2af68649fe568e9711c2dabb23ae%2Fraw%2Fbadge-rosetta-build-arm64.json&logo=docker&label=arm64">
       </td>
       <td>
         <picture>

From a4564f7a9c037d6f6e2c6d8b16462b601b8bded9 Mon Sep 17 00:00:00 2001
From: Oliver Koenig <okoenig@nvidia.com>
Date: Fri, 26 Apr 2024 15:26:50 +0200
Subject: [PATCH 10/11] test: add maxtext mirror patch, comment out test jobs in _ci.yaml

---
 .github/container/manifest.yaml |   3 +
 .github/workflows/_ci.yaml      | 408 ++++++++++++++++----------------
 2 files changed, 207 insertions(+), 204 deletions(-)

diff --git a/.github/container/manifest.yaml b/.github/container/manifest.yaml
index 712b13b55..0aede0826 100644
--- a/.github/container/manifest.yaml
+++ b/.github/container/manifest.yaml
@@ -113,9 +113,12 @@ jax-triton:
   mode: git-clone
 maxtext:
   url: https://github.com/google/maxtext.git
+  mirror_url: https://github.com/nvjax-svc-0/maxtext.git
   tracking_ref: main
   latest_verified_commit: 78daad198544def8274dbd656d122fbe6a0e1129
   mode: git-clone
+  patches:
+    mirror/patch/test_rosetta_maxtext: file://patches/maxtext/mirror-patch-rosetta-maxtext.patch
 levanter:
   url: https://github.com/stanford-crfm/levanter.git
   tracking_ref: main
diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml
index c28f944c6..203a58163 100644
--- a/.github/workflows/_ci.yaml
+++ b/.github/workflows/_ci.yaml
@@ -14,7 +14,7 @@ on:
       MANIFEST_ARTIFACT_NAME:
         type: string
         description: Artifact name in current run w/ manifest/patches. Leaving empty uses manifest/patches in current branch
-        default: ""
+        default: ''
         required: false
     outputs:
       DOCKER_TAGS:
@@ -105,8 +105,8 @@ jobs:
     uses: ./.github/workflows/_build.yaml
     with:
       ARCHITECTURE: ${{ inputs.ARCHITECTURE }}
-      ARTIFACT_NAME: "artifact-levanter-build"
-      BADGE_FILENAME: "badge-levanter-build"
+      ARTIFACT_NAME: 'artifact-levanter-build'
+      BADGE_FILENAME: 'badge-levanter-build'
       BUILD_DATE: ${{ inputs.BUILD_DATE }}
       BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}
       CONTAINER_NAME: levanter
@@ -118,8 +118,8 @@ jobs:
     uses: ./.github/workflows/_build.yaml
     with:
       ARCHITECTURE: ${{ inputs.ARCHITECTURE }}
-      ARTIFACT_NAME: "artifact-t5x-build"
-      BADGE_FILENAME: "badge-t5x-build"
+      ARTIFACT_NAME: 'artifact-t5x-build'
+      BADGE_FILENAME: 'badge-t5x-build'
       BUILD_DATE: ${{ inputs.BUILD_DATE }}
       BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}
       CONTAINER_NAME: upstream-t5x
@@ -164,8 +164,8 @@ jobs:
     uses: ./.github/workflows/_build.yaml
     with:
       ARCHITECTURE: ${{ inputs.ARCHITECTURE }}
-      ARTIFACT_NAME: "artifact-grok-build"
-      BADGE_FILENAME: "badge-grok-build"
+      ARTIFACT_NAME: 'artifact-grok-build'
+      BADGE_FILENAME: 'badge-grok-build'
       BUILD_DATE: ${{ inputs.BUILD_DATE }}
       BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}
       CONTAINER_NAME: grok
@@ -174,7 +174,7 @@ jobs:
 
   collect-docker-tags:
     runs-on: ubuntu-22.04
-    if: "!cancelled()"
+    if: '!cancelled()'
     needs:
       - build-base
       - build-jax
@@ -227,58 +227,58 @@ jobs:
 
           echo "TAGS=${TAGS}" >> $GITHUB_OUTPUT
 
-  test-distribution:
-    runs-on: ubuntu-22.04
-    strategy:
-      matrix:
-        TEST_SCRIPT:
-          - extra-only-distribution.sh
-          - mirror-only-distribution.sh
-          - upstream-only-distribution.sh
-          - local-patch-distribution.sh
-      fail-fast: false
-    steps:
-      - name: Print environment variables
-        run: env
-      - name: Set git login for tests
-        run: |
-          git config --global user.email "jax@nvidia.com"
-          git config --global user.name "JAX-Toolbox CI"
-      - name: Check out the repository under ${GITHUB_WORKSPACE}
-        uses: actions/checkout@v4
-      - name: Run integration test ${{ matrix.TEST_SCRIPT }}
-        run: bash rosetta/tests/${{ matrix.TEST_SCRIPT }}
+  # test-distribution:
+  #   runs-on: ubuntu-22.04
+  #   strategy:
+  #     matrix:
+  #       TEST_SCRIPT:
+  #         - extra-only-distribution.sh
+  #         - mirror-only-distribution.sh
+  #         - upstream-only-distribution.sh
+  #         - local-patch-distribution.sh
+  #     fail-fast: false
+  #   steps:
+  #     - name: Print environment variables
+  #       run: env
+  #     - name: Set git login for tests
+  #       run: |
+  #         git config --global user.email "jax@nvidia.com"
+  #         git config --global user.name "JAX-Toolbox CI"
+  #     - name: Check out the repository under ${GITHUB_WORKSPACE}
+  #       uses: actions/checkout@v4
+  #     - name: Run integration test ${{ matrix.TEST_SCRIPT }}
+  #       run: bash rosetta/tests/${{ matrix.TEST_SCRIPT }}
 
-  test-jax:
-    needs: build-jax
-    if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a
-    uses: ./.github/workflows/_test_unit.yaml
-    with:
-      TEST_NAME: jax
-      EXECUTE: |
-        docker run -i --shm-size=1g --gpus all \
-        ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \
-        bash <<"EOF" |& tee test-backend-independent.log
-          test-jax.sh -b backend-independent 
-        EOF
-        docker run -i --shm-size=1g --gpus all \
-        ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \
-        bash <<"EOF" |& tee tee test-gpu.log
-          test-jax.sh -b gpu
-        EOF
-      STATISTICS_SCRIPT: |
-        errors=$(cat test-*.log | grep -c 'ERROR:' || true)
-        failed_tests=$(cat test-*.log | grep -c 'FAILED in' || true)
-        passed_tests=$(cat test-*.log | grep -c 'PASSED in' || true)
-        total_tests=$((failed_tests + passed_tests))
-        echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT
-        echo "ERRORS=${errors}" >> $GITHUB_OUTPUT
-        echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT
-        echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT
-      ARTIFACTS: |
-        test-backend-independent.log
-        test-gpu.log
-    secrets: inherit
+  # test-jax:
+  #   needs: build-jax
+  #   if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a
+  #   uses: ./.github/workflows/_test_unit.yaml
+  #   with:
+  #     TEST_NAME: jax
+  #     EXECUTE: |
+  #       docker run -i --shm-size=1g --gpus all \
+  #       ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \
+  #       bash <<"EOF" |& tee test-backend-independent.log
+  #         test-jax.sh -b backend-independent
+  #       EOF
+  #       docker run -i --shm-size=1g --gpus all \
+  #       ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \
+  #       bash <<"EOF" |& tee test-gpu.log
+  #         test-jax.sh -b gpu
+  #       EOF
+  #     STATISTICS_SCRIPT: |
+  #       errors=$(cat test-*.log | grep -c 'ERROR:' || true)
+  #       failed_tests=$(cat test-*.log | grep -c 'FAILED in' || true)
+  #       passed_tests=$(cat test-*.log | grep -c 'PASSED in' || true)
+  #       total_tests=$((failed_tests + passed_tests))
+  #       echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT
+  #       echo "ERRORS=${errors}" >> $GITHUB_OUTPUT
+  #       echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT
+  #       echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT
+  #     ARTIFACTS: |
+  #       test-backend-independent.log
+  #       test-gpu.log
+  #   secrets: inherit
 
   # test-equinox:
   #   needs: build-equinox
@@ -304,157 +304,157 @@ jobs:
   #       test-equinox.log
   #   secrets: inherit
 
-  test-te-multigpu:
-    needs: build-upstream-pax
-    if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a
-    uses: ./.github/workflows/_test_te.yaml
-    with:
-      TE_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }}
-    secrets: inherit
+  # test-te-multigpu:
+  #   needs: build-upstream-pax
+  #   if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a
+  #   uses: ./.github/workflows/_test_te.yaml
+  #   with:
+  #     TE_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }}
+  #   secrets: inherit
 
-  test-upstream-t5x:
-    needs: build-upstream-t5x
-    if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a
-    uses: ./.github/workflows/_test_upstream_t5x.yaml
-    with:
-      T5X_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }}
-    secrets: inherit
+  # test-upstream-t5x:
+  #   needs: build-upstream-t5x
+  #   if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a
+  #   uses: ./.github/workflows/_test_upstream_t5x.yaml
+  #   with:
+  #     T5X_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }}
+  #   secrets: inherit
 
-  test-rosetta-t5x:
-    needs: build-rosetta-t5x
-    if: inputs.ARCHITECTURE == 'amd64' # no images for arm64
-    uses: ./.github/workflows/_test_t5x_rosetta.yaml
-    with:
-      T5X_IMAGE: ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }}
-    secrets: inherit
+  # test-rosetta-t5x:
+  #   needs: build-rosetta-t5x
+  #   if: inputs.ARCHITECTURE == 'amd64' # no images for arm64
+  #   uses: ./.github/workflows/_test_t5x_rosetta.yaml
+  #   with:
+  #     T5X_IMAGE: ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }}
+  #   secrets: inherit
 
-  test-pallas:
-    needs: build-jax
-    if: inputs.ARCHITECTURE == 'amd64' # triton doesn't support arm64(?)
-    uses: ./.github/workflows/_test_unit.yaml
-    with:
-      TEST_NAME: pallas
-      EXECUTE: |
-        docker run -i --shm-size=1g --gpus all --volume $PWD:/output \
-        ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \
-        bash <<"EOF" |& tee test-pallas.log
-          python /opt/jax/tests/pallas/pallas_test.py --xml_output_file /output/pallas_test.xml
-        EOF
-      STATISTICS_SCRIPT: |
-        curl -L -o yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_$(dpkg --print-architecture) && chmod 777 yq;
-        total_tests=$(./yq '.testsuites."+@tests"' pallas_test.xml)
-        errors=$(./yq '.testsuites."+@errors"' pallas_test.xml)
-        failed_tests=$(./yq '.testsuites."+@failures"' pallas_test.xml)
-        passed_tests=$((total_tests - errors - failed_tests))
-        echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT
-        echo "ERRORS=${errors}" >> $GITHUB_OUTPUT
-        echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT
-        echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT
-      ARTIFACTS: |
-        test-pallas.log
-    secrets: inherit
+  # test-pallas:
+  #   needs: build-jax
+  #   if: inputs.ARCHITECTURE == 'amd64' # triton doesn't support arm64(?)
+  #   uses: ./.github/workflows/_test_unit.yaml
+  #   with:
+  #     TEST_NAME: pallas
+  #     EXECUTE: |
+  #       docker run -i --shm-size=1g --gpus all --volume $PWD:/output \
+  #       ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \
+  #       bash <<"EOF" |& tee test-pallas.log
+  #         python /opt/jax/tests/pallas/pallas_test.py --xml_output_file /output/pallas_test.xml
+  #       EOF
+  #     STATISTICS_SCRIPT: |
+  #       curl -L -o yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_$(dpkg --print-architecture) && chmod 777 yq;
+  #       total_tests=$(./yq '.testsuites."+@tests"' pallas_test.xml)
+  #       errors=$(./yq '.testsuites."+@errors"' pallas_test.xml)
+  #       failed_tests=$(./yq '.testsuites."+@failures"' pallas_test.xml)
+  #       passed_tests=$((total_tests - errors - failed_tests))
+  #       echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT
+  #       echo "ERRORS=${errors}" >> $GITHUB_OUTPUT
+  #       echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT
+  #       echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT
+  #     ARTIFACTS: |
+  #       test-pallas.log
+  #   secrets: inherit
 
-  test-triton:
-    needs: build-triton
-    if: inputs.ARCHITECTURE == 'amd64' # no images for arm64
-    uses: ./.github/workflows/_test_unit.yaml
-    with:
-      TEST_NAME: triton
-      EXECUTE: |
-        docker run -i --shm-size=1g --gpus all --volume $PWD:/output \
-        ${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }} \
-        bash <<"EOF" |& tee test-triton.log
-          python /opt/jax-triton/tests/triton_call_test.py --xml_output_file /output/triton_test.xml
-        EOF
-      STATISTICS_SCRIPT: |
-        curl -L -o yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_$(dpkg --print-architecture) && chmod 777 yq;
-        total_tests=$(./yq '.testsuites."+@tests"' triton_test.xml)
-        errors=$(./yq '.testsuites."+@errors"' triton_test.xml)
-        failed_tests=$(./yq '.testsuites."+@failures"' triton_test.xml)
-        passed_tests=$((total_tests - errors - failed_tests))
-        echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT
-        echo "ERRORS=${errors}" >> $GITHUB_OUTPUT
-        echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT
-        echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT
-      ARTIFACTS: |
-        test-triton.log
-    secrets: inherit
+  # test-triton:
+  #   needs: build-triton
+  #   if: inputs.ARCHITECTURE == 'amd64' # no images for arm64
+  #   uses: ./.github/workflows/_test_unit.yaml
+  #   with:
+  #     TEST_NAME: triton
+  #     EXECUTE: |
+  #       docker run -i --shm-size=1g --gpus all --volume $PWD:/output \
+  #       ${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }} \
+  #       bash <<"EOF" |& tee test-triton.log
+  #         python /opt/jax-triton/tests/triton_call_test.py --xml_output_file /output/triton_test.xml
+  #       EOF
+  #     STATISTICS_SCRIPT: |
+  #       curl -L -o yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_$(dpkg --print-architecture) && chmod 777 yq;
+  #       total_tests=$(./yq '.testsuites."+@tests"' triton_test.xml)
+  #       errors=$(./yq '.testsuites."+@errors"' triton_test.xml)
+  #       failed_tests=$(./yq '.testsuites."+@failures"' triton_test.xml)
+  #       passed_tests=$((total_tests - errors - failed_tests))
+  #       echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT
+  #       echo "ERRORS=${errors}" >> $GITHUB_OUTPUT
+  #       echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT
+  #       echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT
+  #     ARTIFACTS: |
+  #       test-triton.log
+  #   secrets: inherit
 
-  test-levanter:
-    needs: build-levanter
-    if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a
-    uses: ./.github/workflows/_test_unit.yaml
-    with:
-      TEST_NAME: levanter
-      EXECUTE: |
-        docker run -i --gpus all --shm-size=1g \
-        ${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }} \
-        bash <<"EOF" |& tee test-levanter.log
-          pip install pytest
-          PYTHONPATH=/opt/levanter/tests:$PYTHONPATH pytest /opt/levanter/tests
-        EOF
-      STATISTICS_SCRIPT: |
-        summary_line=$(tail -n1 test-levanter.log)
-        errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}')
-        failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}')
-        passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}')
-        total_tests=$((failed_tests + passed_tests))
-        echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT
-        echo "ERRORS=${errors}" >> $GITHUB_OUTPUT
-        echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT
-        echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT
-      ARTIFACTS: |
-        test-levanter.log
-    secrets: inherit
+  # test-levanter:
+  #   needs: build-levanter
+  #   if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a
+  #   uses: ./.github/workflows/_test_unit.yaml
+  #   with:
+  #     TEST_NAME: levanter
+  #     EXECUTE: |
+  #       docker run -i --gpus all --shm-size=1g \
+  #       ${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }} \
+  #       bash <<"EOF" |& tee test-levanter.log
+  #         pip install pytest
+  #         PYTHONPATH=/opt/levanter/tests:$PYTHONPATH pytest /opt/levanter/tests
+  #       EOF
+  #     STATISTICS_SCRIPT: |
+  #       summary_line=$(tail -n1 test-levanter.log)
+  #       errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}')
+  #       failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}')
+  #       passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}')
+  #       total_tests=$((failed_tests + passed_tests))
+  #       echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT
+  #       echo "ERRORS=${errors}" >> $GITHUB_OUTPUT
+  #       echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT
+  #       echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT
+  #     ARTIFACTS: |
+  #       test-levanter.log
+  #   secrets: inherit
 
-  test-te:
-    needs: build-upstream-pax
-    if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a
-    uses: ./.github/workflows/_test_unit.yaml
-    with:
-      TEST_NAME: te
-      EXECUTE: |
-        docker run -i --gpus all --shm-size=1g -v $PWD:/log \
-        ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} \
-        bash <<"EOF" |& tee test-te.log
-          pip install pytest-reportlog
-          pytest --report-log=log/pytest-report.jsonl ${SRC_PATH_TE}/tests/jax
-        EOF
-      STATISTICS_SCRIPT: |
-        summary_line=$(tail -n1 test-te.log)
-        errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}')
-        passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "CollectReport" and .outcome == "passed") | .outcome' | wc -l)
-        failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "CollectReport" and .outcome == "failed") | .outcome' | wc -l)          
-        total_tests=$((failed_tests + passed_tests))
-        echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT
-        echo "ERRORS=${errors}" >> $GITHUB_OUTPUT
-        echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT
-        echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT
-      ARTIFACTS: |
-        test-te.log
-        pytest-report.jsonl
-    secrets: inherit
+  # test-te:
+  #   needs: build-upstream-pax
+  #   if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a
+  #   uses: ./.github/workflows/_test_unit.yaml
+  #   with:
+  #     TEST_NAME: te
+  #     EXECUTE: |
+  #       docker run -i --gpus all --shm-size=1g -v $PWD:/log \
+  #       ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} \
+  #       bash <<"EOF" |& tee test-te.log
+  #         pip install pytest-reportlog
+  #         pytest --report-log=log/pytest-report.jsonl ${SRC_PATH_TE}/tests/jax
+  #       EOF
+  #     STATISTICS_SCRIPT: |
+  #       summary_line=$(tail -n1 test-te.log)
+  #       errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}')
+  #       passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "CollectReport" and .outcome == "passed") | .outcome' | wc -l)
+  #       failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "CollectReport" and .outcome == "failed") | .outcome' | wc -l)
+  #       total_tests=$((failed_tests + passed_tests))
+  #       echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT
+  #       echo "ERRORS=${errors}" >> $GITHUB_OUTPUT
+  #       echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT
+  #       echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT
+  #     ARTIFACTS: |
+  #       test-te.log
+  #       pytest-report.jsonl
+  #   secrets: inherit
 
-  test-upstream-pax:
-    needs: build-upstream-pax
-    if: inputs.ARCHITECTURE == 'amd64' # no images for arm64
-    uses: ./.github/workflows/_test_upstream_pax.yaml
-    with:
-      PAX_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }}
-    secrets: inherit
+  # test-upstream-pax:
+  #   needs: build-upstream-pax
+  #   if: inputs.ARCHITECTURE == 'amd64' # no images for arm64
+  #   uses: ./.github/workflows/_test_upstream_pax.yaml
+  #   with:
+  #     PAX_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }}
+  #   secrets: inherit
 
-  test-rosetta-pax:
-    needs: build-rosetta-pax
-    if: inputs.ARCHITECTURE == 'amd64' # no images for arm64
-    uses: ./.github/workflows/_test_pax_rosetta.yaml
-    with:
-      PAX_IMAGE: ${{ needs.build-rosetta-pax.outputs.DOCKER_TAG_FINAL }}
-    secrets: inherit
+  # test-rosetta-pax:
+  #   needs: build-rosetta-pax
+  #   if: inputs.ARCHITECTURE == 'amd64' # no images for arm64
+  #   uses: ./.github/workflows/_test_pax_rosetta.yaml
+  #   with:
+  #     PAX_IMAGE: ${{ needs.build-rosetta-pax.outputs.DOCKER_TAG_FINAL }}
+  #   secrets: inherit
 
-  test-upstream-maxtext:
-    needs: build-upstream-maxtext
-    if: inputs.ARCHITECTURE == 'amd64' # no images for arm64
-    uses: ./.github/workflows/_test_upstream_maxtext.yaml
-    with:
-      MAXTEXT_IMAGE: ${{ needs.build-upstream-maxtext.outputs.DOCKER_TAG_FINAL }}
-    secrets: inherit
+  # test-upstream-maxtext:
+  #   needs: build-upstream-maxtext
+  #   if: inputs.ARCHITECTURE == 'amd64' # no images for arm64
+  #   uses: ./.github/workflows/_test_upstream_maxtext.yaml
+  #   with:
+  #     MAXTEXT_IMAGE: ${{ needs.build-upstream-maxtext.outputs.DOCKER_TAG_FINAL }}
+  #   secrets: inherit

From b876ccfef4005f4216f147cecf76f7bb9d686ffb Mon Sep 17 00:00:00 2001
From: Oliver Koenig <okoenig@nvidia.com>
Date: Fri, 26 Apr 2024 15:55:01 +0200
Subject: [PATCH 11/11] test: default BUMP_MANIFEST to true, use single quotes in ci.yaml

---
 .github/workflows/ci.yaml | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index f93a6535b..fcaf495a6 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -2,7 +2,7 @@ name: CI
 
 on:
   schedule:
-    - cron: "30 9 * * *" # Pacific Time 01:30 AM in UTC
+    - cron: '30 9 * * *' # Pacific Time 01:30 AM in UTC
   pull_request:
     types:
       - opened
@@ -10,7 +10,7 @@ on:
       - ready_for_review
       - synchronize
     paths-ignore:
-      - "**.md"
+      - '**.md'
   workflow_dispatch:
     inputs:
       PUBLISH:
@@ -25,7 +25,7 @@ on:
         required: false
       MERGE_BUMPED_MANIFEST:
         type: boolean
-        description: "(used if BUMP_MANIFEST=true) If true: attempt to PR/merge manifest branch"
+        description: '(used if BUMP_MANIFEST=true) If true: attempt to PR/merge manifest branch'
         default: false
         required: false
 
@@ -80,7 +80,7 @@ jobs:
         id: manifest-branch
         shell: bash -x -e {0}
         run: |
-          BUMP_MANIFEST=${{ github.event_name == 'schedule' || inputs.BUMP_MANIFEST || 'false' }}
+          BUMP_MANIFEST=${{ github.event_name == 'schedule' || inputs.BUMP_MANIFEST || 'true' }}
           MERGE_BUMPED_MANIFEST=${{ github.event_name == 'schedule' || inputs.MERGE_BUMPED_MANIFEST || 'false' }}
           # Prepend nightly manifest branch with "z" to make it appear at the end
           if [[ "$BUMP_MANIFEST" == "true" ]]; then
@@ -183,7 +183,7 @@ jobs:
           name: ${{ needs.metadata.outputs.MANIFEST_ARTIFACT_NAME }}
           path: .github/container/
 
-      - name: "Create local manifest branch: ${{ needs.metadata.outputs.MANIFEST_BRANCH }}"
+      - name: 'Create local manifest branch: ${{ needs.metadata.outputs.MANIFEST_BRANCH }}'
         id: local_branch
         shell: bash -x -e {0}
         run: |
@@ -243,7 +243,7 @@ jobs:
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
 
-      - name: "Log created PR: #${{ fromJson(steps.create_pr.outputs.data).number }}"
+      - name: 'Log created PR: #${{ fromJson(steps.create_pr.outputs.data).number }}'
         if: steps.create_pr.outcome == 'success'
         run: |
           echo "https://github.com/NVIDIA/JAX-Toolbox/pull/${{ fromJson(steps.create_pr.outputs.data).number }}" | tee -a $GITHUB_STEP_SUMMARY
@@ -380,7 +380,7 @@ jobs:
 
   finalize:
     needs: [metadata, amd64, arm64, publish-containers]
-    if: "!cancelled()"
+    if: '!cancelled()'
     uses: ./.github/workflows/_finalize.yaml
     with:
       BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }}