diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 2c4946bbbde1..ae22ede4807b 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -14,13 +14,13 @@ Add a one line overview of what this PR aims to accomplish. # Add a code snippet demonstrating how to use this ``` -# Jenkins CI +# GitHub Actions CI The Jenkins CI system has been replaced by GitHub Actions self-hosted runners. -There's no need to comment `jenkins` on the PR to trigger Jenkins CI. -The GitHub Actions CI will run automatically when the PR is opened. -To run CI on an untrusted fork, a NeMo user with write access must click "Approve and run". +The GitHub Actions CI will run automatically when the "Run CICD" label is added to the PR. +To re-run CI remove and add the label again. +To run CI on an untrusted fork, a NeMo user with write access must first click "Approve and run". # Before your PR is "Ready for review" **Pre checks**: diff --git a/.github/scripts/slackHelper.sh b/.github/scripts/slackHelper.sh new file mode 100644 index 000000000000..4696cebcf13b --- /dev/null +++ b/.github/scripts/slackHelper.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +function sendSlackMessage() { + + WEBHOOK_URL="$1" + PIPELINE_URL="$2" + + curl -X POST -H "Content-type: application/json" --data "{ + \"blocks\": [ + { + \"type\": \"section\", + \"text\": { + \"type\": \"mrkdwn\", + \"text\": \"\ +🚨 *CI/CD failure at <$PIPELINE_URL|NeMo CI>*: + +\" + } + } + ] + }" $WEBHOOK_URL + +} diff --git a/.github/workflows/_test_template.yml b/.github/workflows/_test_template.yml new file mode 100644 index 000000000000..31e9452d0fe5 --- /dev/null +++ b/.github/workflows/_test_template.yml @@ -0,0 +1,58 @@ +name: ~test template + +on: + workflow_call: + inputs: + RUNNER: + type: string + description: Runner to use for test + required: true + TIMEOUT: + type: number + description: Max runtime of test in minutes + required: false + default: 10 + SCRIPT: + type: string + description: Test script to execute + required: true + AFTER_SCRIPT: + type: string + description: Script to run after main test + required: false + default: ":" + IS_OPTIONAL: + type: boolean + description: Failure will cancel all other tests if set to true + required: false + default: false + outputs: + conclusion: + description: Conclusion of main test step + value: ${{ jobs.main.outputs.conclusion }} + +jobs: + main: + runs-on: ${{ inputs.RUNNER }} + timeout-minutes: ${{ inputs.TIMEOUT }} + outputs: + conclusion: ${{ steps.main.conclusion }} + container: + image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} + options: + --device=/dev/nvidia0 + --gpus all + --shm-size=8g + --env TRANSFORMERS_OFFLINE=0 + --env HYDRA_FULL_ERROR=1 + --volume /mnt/datadrive/TestData:/home/TestData + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - id: main + run: ${{ inputs.SCRIPT }} + - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" + if: failure() && inputs.IS_OPTIONAL == false + - name: after_script + if: always() && inputs.AFTER_SCRIPT != ':' + run: ${{ inputs.AFTER_SCRIPT }} \ No newline at end of file diff --git a/.github/workflows/blossom-ci.yml b/.github/workflows/blossom-ci.yml deleted file mode 100644 index bdfb24c4b1e5..000000000000 --- a/.github/workflows/blossom-ci.yml +++ /dev/null @@ -1,104 +0,0 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# A workflow to trigger ci on hybrid infra (github + self hosted runner) -name: Blossom-CI -on: - issue_comment: - types: [created] - workflow_dispatch: - inputs: - platform: - description: 'runs-on argument' - required: false - args: - description: 'argument' - required: false -jobs: - Authorization: - name: Authorization - runs-on: blossom - outputs: - args: ${{ env.args }} - - # This job only runs for pull request comments - if: | - contains( 'okuchaiev,ericharper,titu1994,MaximumEntropy,nithinraok,redoctopus,yidong72,SeanNaren,yzhang123,ekmb,arendu,', format('{0},', github.actor)) && - github.event.comment.body == '/blossom-ci' - steps: - - name: Check if comment is issued by authorized person - run: blossom-ci - env: - OPERATION: 'AUTH' - REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }} - REPO_KEY_DATA: ${{ secrets.BLOSSOM_KEY }} - - Vulnerability-scan: - name: Vulnerability scan - needs: [Authorization] - runs-on: ubuntu-latest - steps: - - name: Checkout code - uses: actions/checkout@v2 - with: - repository: ${{ fromJson(needs.Authorization.outputs.args).repo }} - ref: ${{ fromJson(needs.Authorization.outputs.args).ref }} - lfs: 'true' - - # repo specific steps - #- name: Setup java - # uses: actions/setup-java@v1 - # with: - # java-version: 1.8 - - # add blackduck properties https://synopsys.atlassian.net/wiki/spaces/INTDOCS/pages/631308372/Methods+for+Configuring+Analysis#Using-a-configuration-file - #- name: Setup blackduck properties - # run: | - # PROJECTS=$(mvn -am dependency:tree | grep maven-dependency-plugin | awk '{ out="com.nvidia:"$(NF-1);print out }' | grep rapids | xargs | sed -e 's/ /,/g') - # echo detect.maven.build.command="-pl=$PROJECTS -am" >> application.properties - # echo detect.maven.included.scopes=compile >> application.properties - - - name: Run blossom action - uses: NVIDIA/blossom-action@main - env: - REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }} - REPO_KEY_DATA: ${{ secrets.BLOSSOM_KEY }} - with: - args1: ${{ fromJson(needs.Authorization.outputs.args).args1 }} - args2: ${{ fromJson(needs.Authorization.outputs.args).args2 }} - args3: ${{ fromJson(needs.Authorization.outputs.args).args3 }} - - Job-trigger: - name: Start ci job - needs: [Vulnerability-scan] - runs-on: blossom - steps: - - name: Start ci job - run: blossom-ci - env: - OPERATION: 'START-CI-JOB' - CI_SERVER: ${{ secrets.CI_SERVER }} - REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - Upload-Log: - name: Upload log - runs-on: blossom - if : github.event_name == 'workflow_dispatch' - steps: - - name: Jenkins log for pull request ${{ fromJson(github.event.inputs.args).pr }} (click here) - run: blossom-ci - env: - OPERATION: 'POST-PROCESSING' - CI_SERVER: ${{ secrets.CI_SERVER }} - REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index e6e8fb808943..12b8cdcb8eed 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -15,7 +15,9 @@ name: "CICD NeMo" on: pull_request: - branches: [ "main" ] + branches: + - 'main' + - 'r**' types: [ labeled ] concurrency: @@ -41,550 +43,420 @@ jobs: docker container prune 
--filter "until=24h" --force docker image prune -a --filter "until=24h" --force -# checkout-repository: -# runs-on: self-hosted-azure -# container: -# image: nvcr.io/nvidia/pytorch:24.01-py3 -# volumes: -# - ${{ github.workspace }}:/workspace -# steps: -# - name: Checkout repository -# uses: actions/checkout@v4 -# with: -# path: ${{ github.run_id }} - cicd-test-container-setup: needs: [cicd-cluster-clean] runs-on: self-hosted-azure-builder if: ${{ github.event.label.name == 'Run CICD' }} - # uses: actions/cache@v2 - #container: -# image: nvcr.io/nvidia/pytorch:24.01-py3 -# options: -# # --user 0:128 -# --device=/dev/nvidia0 -# --gpus all -# --shm-size=8g -# --env TRANSFORMERS_OFFLINE=0 -# --env HYDRA_FULL_ERROR=1 steps: - name: Checkout repository uses: actions/checkout@v4 with: path: ${{ github.run_id }} - - - name: Container setup - run: | - # Pull base PyTorch container - docker pull nvcr.io/nvidia/pytorch:24.02-py3 - docker run --device=/dev/nvidia0 --gpus all --shm-size=8g --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1 --volume ${{ github.workspace }}/${{ github.run_id }}:/workspace --volume /mnt/datadrive/TestData:/home/TestData nvcr.io/nvidia/pytorch:24.01-py3 /bin/bash -c ' - set -x - - # PyTorch version - python -c "import torch; print(torch.__version__)" - python -c "import torchvision; print(torchvision.__version__)" - - # Install test requirements - apt-get update && apt-get install -y bc && pip install -r requirements/requirements_test.txt && pip install -r requirements/requirements_lightning.txt - - # Code formatting checks - python setup.py style - - # Copyright Headers check - python tests/check_copyright_header.py --dir . - - # NeMo Installation - ./reinstall.sh release - - # Transformer Engine installation - git clone https://github.com/NVIDIA/TransformerEngine.git && \ - pushd TransformerEngine && \ - git fetch origin bfe21c3d68b0a9951e5716fb520045db53419c5e && \ - git checkout FETCH_HEAD && \ - git submodule init && git submodule update && \ - NVTE_FRAMEWORK=pytorch NVTE_WITH_USERBUFFERS=1 MPI_HOME=/usr/local/mpi pip install . && \ - popd - - # Apex installation - git clone https://github.com/NVIDIA/apex.git && \ - pushd apex && \ - git checkout 810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c && \ - cp -R apex /usr/local/lib/python3.10/dist-packages && \ - popd - - # pip package should be working with main, if not we can update the commit here - # until the pip package is updated - # Megatron Core installation - git clone https://github.com/NVIDIA/Megatron-LM.git && \ - pushd Megatron-LM && \ - git checkout fbb375d4b5e88ce52f5f7125053068caff47f93f && \ - pip install . 
&& \ - pushd megatron/core/datasets && \ - make && \ - popd && \ - popd - export PYTHONPATH="${PYTHONPATH}:/workspace/Megatron-LM" - - # Install only for test: L2: Segmentation Tool - pushd tools/ctc_segmentation && \ - pip install -r requirements.txt && \ - apt-get update && apt-get install libsox-fmt-all -y && \ - popd - - # PyTorch Lightning version - python -c "import pytorch_lightning; print(pytorch_lightning.__version__)" - - # PyTorch Lightning DDP Checks - CUDA_VISIBLE_DEVICES="0,1" python "tests/core_ptl/check_for_ranks.py" - - # Basic Import Checks - python -c "import nemo.collections.asr as nemo_asr" - python -c "import nemo.collections.nlp as nemo_nlp" - python -c "import nemo.collections.tts as nemo_tts" - - # set permission - chmod 777 -R /workspace - ' - ### \'\' - - - name: Push container to registry for future use + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + with: + # We use `docker` driver as this speeds things up for + # trivial (non-multi-stage) builds. + driver: docker + + - name: Build and push + uses: docker/build-push-action@v5 + with: + file: Dockerfile.ci + push: true + cache-from: nemoci.azurecr.io/nemo_container:latest + cache-to: type=inline + tags: | + nemoci.azurecr.io/nemo_container_${{ github.run_id }} + nemoci.azurecr.io/nemo_container:latest + + - name: Run some checks run: | - # Push container - echo "Docker: List containers" && docker ps -a - DOCKER_COMMIT=$(docker ps --latest --quiet) # latest container - docker commit $DOCKER_COMMIT nemoci.azurecr.io/nemo_container_${{ github.run_id }} - docker tag nemoci.azurecr.io/nemo_container_${{ github.run_id }} nemoci.azurecr.io/nemo_container_${{ github.run_id }} - docker push nemoci.azurecr.io/nemo_container_${{ github.run_id }} - - # - name: Build and push to local registry - # uses: docker/build-push-action@v5 - # with: - # context: . - # push: true - # tags: nemoci.azurecr.io/name/app:latest - - # - name: Inspect - # run: | - # docker buildx imagetools inspect nemoci.azurecr.io/name/app:latest + docker run --rm --device=/dev/nvidia0 --gpus all --shm-size=8g --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1 --env PYTHONUNBUFFERED=1 nemoci.azurecr.io/nemo_container_${{ github.run_id }} bash -c '\ + # PyTorch Lightning version + python -c "import pytorch_lightning; print(pytorch_lightning.__version__)" + + # PyTorch Lightning DDP Checks + CUDA_VISIBLE_DEVICES="0,1" python "tests/core_ptl/check_for_ranks.py" + + # Basic Import Checks + python -c "import nemo.collections.asr as nemo_asr" + python -c "import nemo.collections.nlp as nemo_nlp" + python -c "import nemo.collections.tts as nemo_tts" + + python setup.py style + python tests/check_copyright_header.py --dir . - #- name: Post-workflow execution - # uses: gacts/run-and-post-run@v1 - # with: - # post: | - # chmod -R 777 . 
+ # These checks are not crucial + exit 0 + ' + ### \'\' - L0_Unit_Tests_GPU: + OPTIONAL_L0_Unit_Tests_GPU: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - name: "L0: Unit Tests GPU" - run: | + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + TIMEOUT: 30 + SCRIPT: | NEMO_NUMBA_MINVER=0.53 pytest -m "not pleasefixme" --with_downloads - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" - + IS_OPTIONAL: true L0_Unit_Tests_CPU: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-cpu - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - name: "L0: Unit Tests CPU" - run: | + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-cpu + TIMEOUT: 60 + SCRIPT: | CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" - + L0_Setup_Test_Data_And_Models: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python -m tests.setup --save_dir /home/TestData/nlp -## - name: L2: Multimodal Imagen Train + ## - name: L2: Multimodal Imagen Train # L2: Community LLM Checkpoints tests L2_Community_LLM_Checkpoints_tests_Llama: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - CUDA_VISIBLE_DEVICES=0 python scripts/checkpoint_converters/convert_llama_hf_to_nemo.py \ - --input_name_or_path=/home/TestData/nlp/megatron_llama/llama-ci-hf \ - --output_path=/home/TestData/nlp/megatron_llama/llama-ci-hf/llama_ci.nemo \ - --precision=16 - rm -f /home/TestData/nlp/megatron_llama/llama-ci-hf/llama_ci.nemo - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + CUDA_VISIBLE_DEVICES=0 python scripts/checkpoint_converters/convert_llama_hf_to_nemo.py \ + --input_name_or_path=/home/TestData/nlp/megatron_llama/llama-ci-hf-tiny \ + --output_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \ + --precision=16 + AFTER_SCRIPT: | + rm -rf /home/TestData/nlp/megatron_llama/model_weights + + L2_Community_LLM_Checkpoints_tests_Llama3: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + CUDA_VISIBLE_DEVICES=0 python scripts/checkpoint_converters/convert_llama_hf_to_nemo.py \ + --input_name_or_path=/home/TestData/nlp/megatron_llama/llama3-ci-hf \ + 
--output_path=/home/TestData/nlp/megatron_llama/llama3-ci-hf/llama3_ci.nemo \ + --precision=16 + AFTER_SCRIPT: | + rm -f /home/TestData/nlp/megatron_llama/llama3-ci-hf/llama3_ci.nemo + rm -rf /home/TestData/nlp/megatron_llama/llama3-ci-hf/model_weights L2_Community_LLM_Checkpoints_tests_StarCoder: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python scripts/checkpoint_converters/convert_starcoder_hf_to_nemo.py \ - --input_name_or_path /home/TestData/nlp/megatron_gpt/starcoder-ci-hf \ - --output_path /home/TestData/nlp/megatron_gpt/starcoder-ci-hf - rm -f /home/TestData/nlp/megatron_gpt/starcoder-ci-hf/megatron_starcoder_tp1_pp1.nemo - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + mkdir -p /home/TestData/nlp/megatron_gpt/starcoder-ci-hf/${{ github.run_id }}; + python scripts/checkpoint_converters/convert_starcoder_hf_to_nemo.py \ + --input_name_or_path /home/TestData/nlp/megatron_gpt/starcoder-ci-hf \ + --output_path /home/TestData/nlp/megatron_gpt/starcoder-ci-hf/${{ github.run_id }} + AFTER_SCRIPT: | + rm -rf /home/TestData/nlp/megatron_gpt/starcoder-ci-hf/megatron_starcoder_tp1_pp1.nemo; + rm -rf /home/TestData/nlp/megatron_gpt/starcoder-ci-hf/${{ github.run_id }}/ + rm -rf /home/TestData/nlp/megatron_gpt/starcoder-ci-hf/model_weights L2_Community_LLM_Checkpoints_tests_Falcon: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python scripts/checkpoint_converters/convert_falcon_hf_to_nemo.py \ - --input_name_or_path /home/TestData/nlp/megatron_gpt/falcon-ci-hf \ - --output_path /home/TestData/nlp/megatron_gpt/falcon-ci-hf/falcon_ci.nemo - rm -f /home/TestData/nlp/megatron_gpt/falcon-ci-hf/falcon_ci.nemo - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python scripts/checkpoint_converters/convert_falcon_hf_to_nemo.py \ + --input_name_or_path /home/TestData/nlp/megatron_gpt/falcon-ci-hf \ + --output_path /home/TestData/nlp/megatron_gpt/falcon-ci-hf/falcon_ci.nemo + rm -f /home/TestData/nlp/megatron_gpt/falcon-ci-hf/falcon_ci.nemo + AFTER_SCRIPT: | + rm -rf /home/TestData/nlp/megatron_gpt/falcon-ci-hf/model_weights + + # this test is using a 7B model which is too large for GitHub CI + # replace the model in this test with a toy model or move the test + # to the nightly CI + # OPTIONAL_L2_Community_LLM_Checkpoints_tests_Baichuan2: + # needs: [cicd-test-container-setup] + # runs-on: self-hosted-azure + # container: + # image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} + # options: + # # --user 0:128 + # --device=/dev/nvidia0 + # --gpus all + # --shm-size=8g + # --env TRANSFORMERS_OFFLINE=0 + # --env HYDRA_FULL_ERROR=1 + # 
--volume /mnt/datadrive/TestData:/home/TestData + # steps: + # - name: Checkout repository + # uses: actions/checkout@v4 + # - run: | + # python scripts/checkpoint_converters/convert_baichuan2_hf_to_nemo.py \ + # --input_name_or_path=/home/TestData/nlp/megatron_gpt/Baichuan2-7B-Base \ + # --output_path=/home/TestData/nlp/megatron_gpt/Baichuan2-7B-Base/ci.nemo + # rm -f /home/TestData/nlp/megatron_gpt/Baichuan2-7B-Base/ci.nemo + # - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" + # if: "failure()" - L2_Community_LLM_Checkpoints_tests_Baichuan2: + L2_PTQ_Llama2_Export_Only: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python scripts/checkpoint_converters/convert_baichuan2_hf_to_nemo.py \ - --input_name_or_path=/home/TestData/nlp/megatron_gpt/Baichuan2-7B-Base \ - --output_path=/home/TestData/nlp/megatron_gpt/Baichuan2-7B-Base/ci.nemo - rm -f /home/TestData/nlp/megatron_gpt/Baichuan2-7B-Base/ci.nemo - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_quantization.py \ + model_file=/home/TestData/nlp/megatron_llama/llama_ci.nemo \ + quantization.algorithm=null \ + model_save=/home/TestData/nlp/megatron_llama/ci_baseline + AFTER_SCRIPT: | + rm -rf /home/TestData/nlp/megatron_llama/ci_baseline + + L2_PTQ_Llama2_FP8: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_quantization.py \ + model_file=/home/TestData/nlp/megatron_llama/llama_ci.nemo \ + tensor_model_parallel_size=2 \ + trainer.devices=2 \ + quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \ + quantization.algorithm=fp8 \ + quantization.num_calib_size=8 \ + inference.batch_size=2 \ + export.inference_tensor_parallel=2 \ + model_save=/home/TestData/nlp/megatron_llama/ci_fp8.qnemo + AFTER_SCRIPT: | + rm -rf /home/TestData/nlp/megatron_llama/ci_fp8.qnemo + + L2_PTQ_Llama2_INT8_SQ: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_quantization.py \ + model_file=/home/TestData/nlp/megatron_llama/llama_ci.nemo \ + quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \ + quantization.algorithm=int8_sq \ + quantization.num_calib_size=8 \ + inference.batch_size=2 \ + model_save=/home/TestData/nlp/megatron_llama/ci_int8_sq.qnemo + AFTER_SCRIPT: | + rm -rf /home/TestData/nlp/megatron_llama/ci_int8_sq.qnemo + + # TODO: investigate int4_awq stuck issues and restore the test + #L2_PTQ_Llama2_INT4_AWQ: + # needs: [cicd-test-container-setup] + # runs-on: self-hosted-azure + # timeout-minutes: 10 + # container: + # image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} + # options: + # # --user 0:128 + # --device=/dev/nvidia0 + # --gpus all + # --shm-size=8g + # --env TRANSFORMERS_OFFLINE=0 + # --env HYDRA_FULL_ERROR=1 + # --volume /mnt/datadrive/TestData:/home/TestData + # steps: + # - name: Checkout 
repository + # uses: actions/checkout@v4 + # - run: | + # python examples/nlp/language_modeling/megatron_quantization.py \ + # model_file=/home/TestData/nlp/megatron_llama/llama_ci.nemo \ + # tensor_model_parallel_size=1 \ + # trainer.devices=1 \ + # quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \ + # quantization.algorithm=int4_awq \ + # quantization.num_calib_size=8 \ + # inference.batch_size=2 \ + # model_save=/home/TestData/nlp/megatron_llama/ci_int4_awq.qnemo + # + # rm -rf /home/TestData/nlp/megatron_llama/ci_int4_awq.qnemo + #- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" + # if: "failure()" # L2: ASR dev run ASR_dev_run_Speech_to_Text: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/asr/asr_ctc/speech_to_text_ctc.py \ - model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=examples/asr/speech_to_text_results - rm -rf examples/asr/speech_to_text_results - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + python examples/asr/asr_ctc/speech_to_text_ctc.py \ + model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ + model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + +trainer.fast_dev_run=True \ + exp_manager.exp_dir=examples/asr/speech_to_text_results + AFTER_SCRIPT: | + rm -rf examples/asr/speech_to_text_results ASR_dev_run_Speech_to_Text_WPE_-_CitriNet: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/asr/asr_ctc/speech_to_text_ctc_bpe.py \ - --config-path="../conf/citrinet/" --config-name="config_bpe" \ - model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - model.tokenizer.dir="/home/TestData/asr_tokenizers/an4_wpe_128/" \ - model.tokenizer.type="wpe" \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=examples/asr/speech_to_text_wpe_results - rm -rf examples/asr/speech_to_text_wpe_results - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + python examples/asr/asr_ctc/speech_to_text_ctc_bpe.py \ + --config-path="../conf/citrinet/" --config-name="config_bpe" \ + model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ + 
model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ + model.tokenizer.dir="/home/TestData/asr_tokenizers/an4_wpe_128/" \ + model.tokenizer.type="wpe" \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + +trainer.fast_dev_run=True \ + exp_manager.exp_dir=examples/asr/speech_to_text_wpe_results + AFTER_SCRIPT: | + rm -rf examples/asr/speech_to_text_wpe_results ASR_dev_run_Speech_Pre-training_-_CitriNet: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/asr/speech_pretraining/speech_pre_training.py \ - --config-path="../conf/ssl/citrinet/" --config-name="citrinet_ssl_ci" \ - model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=examples/asr/speech_pre_training_results - rm -rf examples/asr/speech_pre_training_results - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + python examples/asr/speech_pretraining/speech_pre_training.py \ + --config-path="../conf/ssl/citrinet/" --config-name="citrinet_ssl_ci" \ + model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ + model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + +trainer.fast_dev_run=True \ + exp_manager.exp_dir=examples/asr/speech_pre_training_results + AFTER_SCRIPT: | + rm -rf examples/asr/speech_pre_training_results ASR_dev_run_Speech_To_Text_Finetuning: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/asr/speech_to_text_finetune.py \ - --config-path="conf/asr_finetune" --config-name="speech_to_text_finetune" \ - model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - init_from_nemo_model=/home/TestData/asr/stt_en_fastconformer_transducer_large.nemo \ - model.tokenizer.update_tokenizer=False \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=examples/asr/speech_finetuning_results - rm -rf examples/asr/speech_finetuning_results - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" - - ASR_dev_run_Speech_To_Text_HF_Finetuning: + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + python examples/asr/speech_to_text_finetune.py \ + --config-path="conf/asr_finetune" --config-name="speech_to_text_finetune" \ + model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ + 
model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ + init_from_nemo_model=/home/TestData/asr/stt_en_fastconformer_transducer_large.nemo \ + model.tokenizer.update_tokenizer=False \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + +trainer.fast_dev_run=True \ + exp_manager.exp_dir=examples/asr/speech_finetuning_results + AFTER_SCRIPT: | + rm -rf examples/asr/speech_finetuning_results + + OPTIONAL_ASR_dev_run_Speech_To_Text_HF_Finetuning: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/asr/speech_to_text_finetune.py \ - --config-path="conf/asr_finetune" --config-name="speech_to_text_hf_finetune" \ - ~model.train_ds.hf_data_cfg \ - model.train_ds.num_workers=1 \ - model.train_ds.batch_size=2 model.validation_ds.batch_size=2 \ - model.train_ds.streaming=true \ - +model.train_ds.hf_data_cfg.path="librispeech_asr" \ - +model.train_ds.hf_data_cfg.name=null \ - +model.train_ds.hf_data_cfg.split="test.clean" \ - +model.train_ds.hf_data_cfg.streaming=true \ - ~model.validation_ds.hf_data_cfg \ - model.validation_ds.streaming=true \ - +model.validation_ds.hf_data_cfg.path="librispeech_asr" \ - +model.validation_ds.hf_data_cfg.name=null \ - +model.validation_ds.hf_data_cfg.split="test.clean" \ - +model.validation_ds.hf_data_cfg.streaming=true \ - ~model.test_ds \ - init_from_nemo_model=/home/TestData/asr/stt_en_fastconformer_transducer_large.nemo \ - model.tokenizer.update_tokenizer=False \ - model.optim.sched.warmup_steps=0 \ - +model.optim.sched.max_steps=3 \ - trainer.max_epochs=null \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=examples/asr/speech_finetuning_results - rm -rf examples/asr/speech_finetuning_results - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: |- + python examples/asr/speech_to_text_finetune.py \ + --config-path="conf/asr_finetune" --config-name="speech_to_text_hf_finetune" \ + ~model.train_ds.hf_data_cfg \ + model.train_ds.num_workers=1 \ + model.train_ds.batch_size=2 model.validation_ds.batch_size=2 \ + model.train_ds.streaming=true \ + +model.train_ds.hf_data_cfg.path="librispeech_asr" \ + +model.train_ds.hf_data_cfg.name=null \ + +model.train_ds.hf_data_cfg.split="test.clean" \ + +model.train_ds.hf_data_cfg.streaming=true \ + ~model.validation_ds.hf_data_cfg \ + model.validation_ds.streaming=true \ + +model.validation_ds.hf_data_cfg.path="librispeech_asr" \ + +model.validation_ds.hf_data_cfg.name=null \ + +model.validation_ds.hf_data_cfg.split="test.clean" \ + +model.validation_ds.hf_data_cfg.streaming=true \ + ~model.test_ds \ + init_from_nemo_model=/home/TestData/asr/stt_en_fastconformer_transducer_large.nemo \ + model.tokenizer.update_tokenizer=False \ + model.optim.sched.warmup_steps=0 \ + +model.optim.sched.max_steps=3 \ + trainer.max_epochs=null \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + +trainer.fast_dev_run=True \ + exp_manager.exp_dir=examples/asr/speech_finetuning_results + AFTER_SCRIPT: | + rm -rf examples/asr/speech_finetuning_results + IS_OPTIONAL: true 
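For reference, a minimal sketch of a job calling the new reusable template follows; the job name and test command are placeholders, and per `_test_template.yml` only `RUNNER` and `SCRIPT` are required inputs, while `TIMEOUT`, `AFTER_SCRIPT`, and `IS_OPTIONAL` fall back to their declared defaults (10 minutes, `":"`, and `false`).

```yaml
# Hypothetical caller sketch (not part of this diff).
My_New_Test:                          # placeholder job name
  needs: [cicd-test-container-setup]
  uses: ./.github/workflows/_test_template.yml
  with:
    RUNNER: self-hosted-azure         # required input
    SCRIPT: |                         # required input
      pytest tests/my_new_test.py     # placeholder test command
    # TIMEOUT, AFTER_SCRIPT, and IS_OPTIONAL are optional and default to
    # 10, ":" and false respectively, as declared in _test_template.yml.
```

When `IS_OPTIONAL` is left at `false`, the template's cancel-workflow step runs on failure and stops the remaining jobs; setting it to `true` lets the rest of the matrix continue, which is how the `OPTIONAL_*` jobs above are wired.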
ASR_dev_run_Speech_to_Text_WPE_-_Conformer: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/asr/asr_ctc/speech_to_text_ctc_bpe.py \ - --config-path="../conf/conformer" --config-name="conformer_ctc_bpe" \ - model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - model.tokenizer.dir="/home/TestData/asr_tokenizers/an4_wpe_128/" \ - model.tokenizer.type="wpe" \ - model.train_ds.batch_size=4 \ - model.validation_ds.batch_size=4 \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=examples/asr/speech_to_text_wpe_conformer_results - rm -rf examples/asr/speech_to_text_wpe_conformer_results - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + python examples/asr/asr_ctc/speech_to_text_ctc_bpe.py \ + --config-path="../conf/conformer" --config-name="conformer_ctc_bpe" \ + model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ + model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ + model.tokenizer.dir="/home/TestData/asr_tokenizers/an4_wpe_128/" \ + model.tokenizer.type="wpe" \ + model.train_ds.batch_size=4 \ + model.validation_ds.batch_size=4 \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + +trainer.fast_dev_run=True \ + exp_manager.exp_dir=examples/asr/speech_to_text_wpe_conformer_results + AFTER_SCRIPT: | + rm -rf examples/asr/speech_to_text_wpe_conformer_results # L2: ASR dev run - part two ASR_dev_run-part_two_Speech_to_Text_WPE_-_Squeezeformer: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/asr/asr_ctc/speech_to_text_ctc_bpe.py \ - --config-path="../conf/squeezeformer" --config-name="squeezeformer_ctc_bpe" \ - model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - model.tokenizer.dir="/home/TestData/asr_tokenizers/an4_wpe_128/" \ - model.tokenizer.type="wpe" \ - model.encoder.d_model=144 \ - model.train_ds.batch_size=4 \ - model.validation_ds.batch_size=4 \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=examples/asr/speech_to_text_wpe_squeezeformer_results - rm -rf examples/asr/speech_to_text_wpe_squeezeformer_results - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + python examples/asr/asr_ctc/speech_to_text_ctc_bpe.py \ + --config-path="../conf/squeezeformer" --config-name="squeezeformer_ctc_bpe" \ + 
model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ + model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ + model.tokenizer.dir="/home/TestData/asr_tokenizers/an4_wpe_128/" \ + model.tokenizer.type="wpe" \ + model.encoder.d_model=144 \ + model.train_ds.batch_size=4 \ + model.validation_ds.batch_size=4 \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + +trainer.fast_dev_run=True \ + exp_manager.exp_dir=examples/asr/speech_to_text_wpe_squeezeformer_results + AFTER_SCRIPT: | + rm -rf examples/asr/speech_to_text_wpe_squeezeformer_results L2_Speech_to_Text_EMA: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/asr/asr_ctc/speech_to_text_ctc.py \ - model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - trainer.devices=2 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - +exp_manager.ema.enable=True \ - exp_manager.exp_dir=examples/asr/speech_to_text_results - rm -rf examples/asr/speech_to_text_results - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/asr/asr_ctc/speech_to_text_ctc.py \ + model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ + model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ + trainer.devices=2 \ + trainer.accelerator="gpu" \ + +trainer.fast_dev_run=True \ + +exp_manager.ema.enable=True \ + exp_manager.exp_dir=examples/asr/speech_to_text_results + AFTER_SCRIPT: | + rm -rf examples/asr/speech_to_text_results + # L2_Speech_to_Text_AED: # needs: [cicd-test-container-setup] @@ -638,514 +510,315 @@ jobs: # L2: Speaker dev run L2_Speaker_dev_run_Speaker_Recognition: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/speaker_tasks/recognition/speaker_reco.py \ - model.train_ds.batch_size=10 \ - model.validation_ds.batch_size=2 \ - model.train_ds.manifest_filepath=/home/TestData/an4_speaker/train.json \ - model.validation_ds.manifest_filepath=/home/TestData/an4_speaker/dev.json \ - model.decoder.num_classes=2 \ - trainer.max_epochs=10 \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=examples/speaker_tasks/recognition/speaker_recognition_results - rm -rf examples/speaker_tasks/recognition/speaker_recognition_results - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + python examples/speaker_tasks/recognition/speaker_reco.py \ + model.train_ds.batch_size=10 \ + model.validation_ds.batch_size=2 \ 
+ model.train_ds.manifest_filepath=/home/TestData/an4_speaker/train.json \ + model.validation_ds.manifest_filepath=/home/TestData/an4_speaker/dev.json \ + model.decoder.num_classes=2 \ + trainer.max_epochs=10 \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + +trainer.fast_dev_run=True \ + exp_manager.exp_dir=examples/speaker_tasks/recognition/speaker_recognition_results + AFTER_SCRIPT: | + rm -rf examples/speaker_tasks/recognition/speaker_recognition_results L2_Speaker_dev_run_Speaker_Diarization: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/speaker_tasks/diarization/neural_diarizer/multiscale_diar_decoder.py \ - model.diarizer.speaker_embeddings.model_path=titanet_large \ - model.train_ds.batch_size=5 \ - model.validation_ds.batch_size=5 \ - model.train_ds.emb_dir=examples/speaker_tasks/diarization/speaker_diarization_results \ - model.validation_ds.emb_dir=examples/speaker_tasks/diarization/speaker_diarization_results \ - model.train_ds.manifest_filepath=/home/TestData/an4_diarizer/simulated_train/msdd_data.50step.json \ - model.validation_ds.manifest_filepath=/home/TestData/an4_diarizer/simulated_valid/msdd_data.50step.json \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=examples/speaker_tasks/diarization/speaker_diarization_results - rm -rf examples/speaker_tasks/diarization/speaker_diarization_results - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + python examples/speaker_tasks/diarization/neural_diarizer/multiscale_diar_decoder.py \ + model.diarizer.speaker_embeddings.model_path=titanet_large \ + model.train_ds.batch_size=5 \ + model.validation_ds.batch_size=5 \ + model.train_ds.emb_dir=examples/speaker_tasks/diarization/speaker_diarization_results \ + model.validation_ds.emb_dir=examples/speaker_tasks/diarization/speaker_diarization_results \ + model.train_ds.manifest_filepath=/home/TestData/an4_diarizer/simulated_train/msdd_data.50step.json \ + model.validation_ds.manifest_filepath=/home/TestData/an4_diarizer/simulated_valid/msdd_data.50step.json \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + +trainer.fast_dev_run=True \ + exp_manager.exp_dir=examples/speaker_tasks/diarization/speaker_diarization_results + AFTER_SCRIPT: | + rm -rf examples/speaker_tasks/diarization/speaker_diarization_results L2_Speaker_dev_run_Speech_to_Label: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/asr/speech_classification/speech_to_label.py \ - model.train_ds.manifest_filepath=/home/TestData/speech_commands/train_manifest.json \ - model.validation_ds.manifest_filepath=/home/TestData/speech_commands/test_manifest.json \ - 
model.test_ds.manifest_filepath=/home/TestData/speech_commands/test_manifest.json \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - model.preprocessor._target_=nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor \ - ~model.preprocessor.window_size \ - ~model.preprocessor.window_stride \ - ~model.preprocessor.window \ - ~model.preprocessor.n_mels \ - ~model.preprocessor.n_mfcc \ - ~model.preprocessor.n_fft \ - exp_manager.exp_dir=examples/asr/speech_to_label_results - rm -rf examples/asr/speech_to_label_results - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + python examples/asr/speech_classification/speech_to_label.py \ + model.train_ds.manifest_filepath=/home/TestData/speech_commands/train_manifest.json \ + model.validation_ds.manifest_filepath=/home/TestData/speech_commands/test_manifest.json \ + model.test_ds.manifest_filepath=/home/TestData/speech_commands/test_manifest.json \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + +trainer.fast_dev_run=True \ + model.preprocessor._target_=nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor \ + ~model.preprocessor.window_size \ + ~model.preprocessor.window_stride \ + ~model.preprocessor.window \ + ~model.preprocessor.n_mels \ + ~model.preprocessor.n_mfcc \ + ~model.preprocessor.n_fft \ + exp_manager.exp_dir=examples/asr/speech_to_label_results + AFTER_SCRIPT: | + rm -rf examples/asr/speech_to_label_results L2_Speaker_dev_run_Speaker_Diarization_with_ASR_Inference: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/speaker_tasks/diarization/clustering_diarizer/offline_diar_with_asr_infer.py \ - diarizer.manifest_filepath=/home/TestData/an4_diarizer/an4_manifest.json \ - diarizer.speaker_embeddings.model_path=/home/TestData/an4_diarizer/spkr.nemo \ - diarizer.speaker_embeddings.parameters.save_embeddings=True \ - diarizer.speaker_embeddings.parameters.window_length_in_sec=[1.5] \ - diarizer.speaker_embeddings.parameters.shift_length_in_sec=[0.75] \ - diarizer.speaker_embeddings.parameters.multiscale_weights=[1.0] \ - diarizer.asr.model_path=QuartzNet15x5Base-En \ - diarizer.asr.parameters.asr_based_vad=True \ - diarizer.out_dir=examples/speaker_tasks/diarization/speaker_diarization_asr_results - rm -rf examples/speaker_tasks/diarization/speaker_diarization_asr_results - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/speaker_tasks/diarization/clustering_diarizer/offline_diar_with_asr_infer.py \ + diarizer.manifest_filepath=/home/TestData/an4_diarizer/an4_manifest.json \ + diarizer.speaker_embeddings.model_path=/home/TestData/an4_diarizer/spkr.nemo \ + diarizer.speaker_embeddings.parameters.save_embeddings=True \ + diarizer.speaker_embeddings.parameters.window_length_in_sec=[1.5] \ + diarizer.speaker_embeddings.parameters.shift_length_in_sec=[0.75] \ + diarizer.speaker_embeddings.parameters.multiscale_weights=[1.0] \ + 
diarizer.asr.model_path=QuartzNet15x5Base-En \ + diarizer.asr.parameters.asr_based_vad=True \ + diarizer.out_dir=examples/speaker_tasks/diarization/speaker_diarization_asr_results + AFTER_SCRIPT: | + rm -rf examples/speaker_tasks/diarization/speaker_diarization_asr_results L2_Speaker_dev_run_Clustering_Diarizer_Inference: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/speaker_tasks/diarization/clustering_diarizer/offline_diar_infer.py \ - diarizer.manifest_filepath=/home/TestData/an4_diarizer/an4_manifest.json \ - diarizer.speaker_embeddings.model_path=/home/TestData/an4_diarizer/spkr.nemo \ - diarizer.speaker_embeddings.parameters.save_embeddings=True \ - diarizer.speaker_embeddings.parameters.window_length_in_sec=1.5 \ - diarizer.speaker_embeddings.parameters.shift_length_in_sec=0.75 \ - diarizer.speaker_embeddings.parameters.multiscale_weights=null \ - diarizer.vad.model_path=/home/TestData/an4_diarizer/MatchboxNet_VAD_3x2.nemo \ - diarizer.out_dir=examples/speaker_tasks/diarization/clustering_diarizer_results - rm -rf examples/speaker_tasks/diarization/clustering_diarizer_results - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" - + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/speaker_tasks/diarization/clustering_diarizer/offline_diar_infer.py \ + diarizer.manifest_filepath=/home/TestData/an4_diarizer/an4_manifest.json \ + diarizer.speaker_embeddings.model_path=/home/TestData/an4_diarizer/spkr.nemo \ + diarizer.speaker_embeddings.parameters.save_embeddings=True \ + diarizer.speaker_embeddings.parameters.window_length_in_sec=1.5 \ + diarizer.speaker_embeddings.parameters.shift_length_in_sec=0.75 \ + diarizer.speaker_embeddings.parameters.multiscale_weights=null \ + diarizer.vad.model_path=/home/TestData/an4_diarizer/MatchboxNet_VAD_3x2.nemo \ + diarizer.out_dir=examples/speaker_tasks/diarization/clustering_diarizer_results + AFTER_SCRIPT: | + rm -rf examples/speaker_tasks/diarization/clustering_diarizer_results L2_Speaker_dev_run_Neural_Diarizer_Inference: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/speaker_tasks/diarization/neural_diarizer/multiscale_diar_decoder_infer.py \ - diarizer.manifest_filepath=/home/TestData/an4_diarizer/an4_manifest.json \ - diarizer.msdd_model.model_path=/home/TestData/an4_diarizer/diar_msdd_telephonic.nemo \ - diarizer.speaker_embeddings.parameters.save_embeddings=True \ - diarizer.vad.model_path=/home/TestData/an4_diarizer/MatchboxNet_VAD_3x2.nemo \ - diarizer.out_dir=examples/speaker_tasks/diarization/neural_diarizer_results - rm -rf examples/speaker_tasks/diarization/neural_diarizer_results - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + 
with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/speaker_tasks/diarization/neural_diarizer/multiscale_diar_decoder_infer.py \ + diarizer.manifest_filepath=/home/TestData/an4_diarizer/an4_manifest.json \ + diarizer.msdd_model.model_path=/home/TestData/an4_diarizer/diar_msdd_telephonic.nemo \ + diarizer.speaker_embeddings.parameters.save_embeddings=True \ + diarizer.vad.model_path=/home/TestData/an4_diarizer/MatchboxNet_VAD_3x2.nemo \ + diarizer.out_dir=examples/speaker_tasks/diarization/neural_diarizer_results + AFTER_SCRIPT: | + rm -rf examples/speaker_tasks/diarization/neural_diarizer_results L2_Speaker_dev_run_Multispeaker_ASR_Data_Simulation: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python tools/speech_data_simulator/multispeaker_simulator.py \ - --config-path=conf --config-name=data_simulator.yaml \ - data_simulator.random_seed=42 \ - data_simulator.manifest_filepath=/home/TestData/LibriSpeechShort/dev-clean-align-short.json \ - data_simulator.outputs.output_dir=./test_simulator \ - data_simulator.session_config.num_sessions=2 \ - data_simulator.session_config.session_length=60 - rm -rf ./test_simulator - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" - + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python tools/speech_data_simulator/multispeaker_simulator.py \ + --config-path=conf --config-name=data_simulator.yaml \ + data_simulator.random_seed=42 \ + data_simulator.manifest_filepath=/home/TestData/LibriSpeechShort/dev-clean-align-short.json \ + data_simulator.outputs.output_dir=./test_simulator \ + data_simulator.session_config.num_sessions=2 \ + data_simulator.session_config.session_length=60 + AFTER_SCRIPT: | + rm -rf ./test_simulator # L2: ASR Multi-dataloader dev run L2_ASR_Multi-dataloader_dev_run_Speech_to_Text_multi-dataloader: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/asr/asr_ctc/speech_to_text_ctc.py \ - model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - model.validation_ds.manifest_filepath=[/home/TestData/an4_dataset/an4_val.json,/home/TestData/an4_dataset/an4_val.json] \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - trainer.max_epochs=1 \ - trainer.max_steps=1 \ - +trainer.num_sanity_val_steps=1 \ - exp_manager.exp_dir=examples/asr/speech_to_text_results - rm -rf examples/asr/speech_to_text_results - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + python examples/asr/asr_ctc/speech_to_text_ctc.py \ + model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ + 
model.validation_ds.manifest_filepath=[/home/TestData/an4_dataset/an4_val.json,/home/TestData/an4_dataset/an4_val.json] \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + trainer.max_epochs=1 \ + trainer.max_steps=1 \ + +trainer.num_sanity_val_steps=1 \ + exp_manager.exp_dir=examples/asr/speech_to_text_results + AFTER_SCRIPT: | + rm -rf examples/asr/speech_to_text_results L2_ASR_Multi-dataloader_dev_run_Speech_to_Label_multi-dataloader: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/asr/speech_classification/speech_to_label.py \ - model.train_ds.manifest_filepath=/home/TestData/speech_commands/train_manifest.json \ - model.validation_ds.manifest_filepath=[/home/TestData/speech_commands/test_manifest.json,/home/TestData/speech_commands/test_manifest.json] \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - trainer.max_epochs=1 \ - trainer.max_steps=1 \ - +trainer.num_sanity_val_steps=1 \ - model.preprocessor._target_=nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor \ - ~model.preprocessor.window_size \ - ~model.preprocessor.window_stride \ - ~model.preprocessor.window \ - ~model.preprocessor.n_mels \ - ~model.preprocessor.n_mfcc \ - ~model.preprocessor.n_fft \ - exp_manager.exp_dir=examples/asr/speech_to_label_results - rm -rf examples/asr/speech_to_label_results - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" - + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + python examples/asr/speech_classification/speech_to_label.py \ + model.train_ds.manifest_filepath=/home/TestData/speech_commands/train_manifest.json \ + model.validation_ds.manifest_filepath=[/home/TestData/speech_commands/test_manifest.json,/home/TestData/speech_commands/test_manifest.json] \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + trainer.max_epochs=1 \ + trainer.max_steps=1 \ + +trainer.num_sanity_val_steps=1 \ + model.preprocessor._target_=nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor \ + ~model.preprocessor.window_size \ + ~model.preprocessor.window_stride \ + ~model.preprocessor.window \ + ~model.preprocessor.n_mels \ + ~model.preprocessor.n_mfcc \ + ~model.preprocessor.n_fft \ + exp_manager.exp_dir=examples/asr/speech_to_label_results + AFTER_SCRIPT: | + rm -rf examples/asr/speech_to_label_results # L2: ASR Adapters L2_ASR_Adapters_Linear_Adapters: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/asr/asr_adapters/train_asr_adapter.py \ - model.pretrained_model="stt_en_conformer_ctc_small" \ - model.adapter.adapter_name="an4" \ - model.adapter.linear.in_features=176 \ - model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - 
trainer.max_steps=5 \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=examples/asr/speech_to_text_adapters_results - rm -rf examples/asr/speech_to_text_adapters_results - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + python examples/asr/asr_adapters/train_asr_adapter.py \ + model.pretrained_model="stt_en_conformer_ctc_small" \ + model.adapter.adapter_name="an4" \ + model.adapter.linear.in_features=176 \ + model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ + model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ + trainer.max_steps=5 \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + +trainer.fast_dev_run=True \ + exp_manager.exp_dir=examples/asr/speech_to_text_adapters_results + AFTER_SCRIPT: | + rm -rf examples/asr/speech_to_text_adapters_results L2_ASR_Adapters_RelPos_MHA_Adapters: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/asr/asr_adapters/train_asr_adapter.py \ - model.pretrained_model="stt_en_conformer_ctc_small" \ - model.adapter.adapter_name="encoder:an4" \ - model.adapter.adapter_type="tiny_attn" \ - model.adapter.tiny_attn.n_feat=176 \ - model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - trainer.max_steps=5 \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=examples/asr/speech_to_text_adapters_mha_results - rm -rf examples/asr/speech_to_text_adapters_mha_results - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" - + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + python examples/asr/asr_adapters/train_asr_adapter.py \ + model.pretrained_model="stt_en_conformer_ctc_small" \ + model.adapter.adapter_name="encoder:an4" \ + model.adapter.adapter_type="tiny_attn" \ + model.adapter.tiny_attn.n_feat=176 \ + model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ + model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ + trainer.max_steps=5 \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + +trainer.fast_dev_run=True \ + exp_manager.exp_dir=examples/asr/speech_to_text_adapters_mha_results + AFTER_SCRIPT: | + rm -rf examples/asr/speech_to_text_adapters_mha_results # L2: Speech Transcription L2_Speech_Transcription_Speech_to_Text_Transcribe: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/asr/transcribe_speech.py \ - pretrained_name="QuartzNet15x5Base-En" \ - 
audio_dir="/home/TestData/an4_transcribe/test_subset/" \ - output_filename="stt_test_res.json" \ - amp=true - rm -rf stt_test_res.json - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/asr/transcribe_speech.py \ + pretrained_name="QuartzNet15x5Base-En" \ + audio_dir="/home/TestData/an4_transcribe/test_subset/" \ + output_filename="stt_test_res.json" \ + amp=true + AFTER_SCRIPT: | + rm -rf stt_test_res.json # L2: Transducer alignment L2_Transducer_alignment_Running_pytest: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - pytest tests/collections/asr/decoding/rnnt_alignments_check.py --durations=-1 - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + pytest tests/collections/asr/decoding/rnnt_alignments_check.py --durations=-1 # L2: Segmentation Tool L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Eng_CitriNet_with_wav: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd tools/ctc_segmentation && \ - TIME=`date +"%Y-%m-%d-%T"` && \ - /bin/bash run_segmentation.sh \ - --MODEL_NAME_OR_PATH="stt_en_citrinet_512_gamma_0_25" \ - --DATA_DIR=/home/TestData/ctc_segmentation/eng \ - --OUTPUT_DIR=/home/TestData/ctc_segmentation/eng/output${TIME} \ - --LANGUAGE=en \ - --USE_NEMO_NORMALIZATION="TRUE" && \ - python /home/TestData/ctc_segmentation/verify_alignment.py \ - -r /home/TestData/ctc_segmentation/eng/eng_valid_segments_1.7.txt \ - -g /home/TestData/ctc_segmentation/eng/output${TIME}/verified_segments/nv_test_segments.txt && \ - rm -rf /home/TestData/ctc_segmentation/eng/output${TIME} - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + cd tools/ctc_segmentation && \ + TIME=`date +"%Y-%m-%d-%T"` && \ + /bin/bash run_segmentation.sh \ + --MODEL_NAME_OR_PATH="stt_en_citrinet_512_gamma_0_25" \ + --DATA_DIR=/home/TestData/ctc_segmentation/eng \ + --OUTPUT_DIR=/home/TestData/ctc_segmentation/eng/output${TIME} \ + --LANGUAGE=en \ + --USE_NEMO_NORMALIZATION="TRUE" && \ + python /home/TestData/ctc_segmentation/verify_alignment.py \ + -r /home/TestData/ctc_segmentation/eng/eng_valid_segments_1.7.txt \ + -g /home/TestData/ctc_segmentation/eng/output${TIME}/verified_segments/nv_test_segments.txt; + AFTER_SCRIPT: | + rm -rf /home/TestData/ctc_segmentation/eng/output${TIME} L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Ru_QN_with_mp3: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - 
--device=/dev/nvidia0
-            --gpus all
-            --shm-size=8g
-            --env TRANSFORMERS_OFFLINE=0
-            --env HYDRA_FULL_ERROR=1
-            --volume /mnt/datadrive/TestData:/home/TestData
-        steps:
-        - name: Checkout repository
-          uses: actions/checkout@v4
-        - run: |
-            cd tools/ctc_segmentation && \
-            TIME=`date +"%Y-%m-%d-%T"` && \
-            /bin/bash run_segmentation.sh \
-            --MODEL_NAME_OR_PATH=/home/TestData/ctc_segmentation/QuartzNet15x5-Ru-e512-wer14.45.nemo \
-            --DATA_DIR=/home/TestData/ctc_segmentation/ru \
-            --OUTPUT_DIR=/home/TestData/ctc_segmentation/ru/output${TIME} \
-            --LANGUAGE=ru \
-            --ADDITIONAL_SPLIT_SYMBOLS=";" && \
-            python /home/TestData/ctc_segmentation/verify_alignment.py \
-            -r /home/TestData/ctc_segmentation/ru/valid_ru_segments_1.7.txt \
-            -g /home/TestData/ctc_segmentation/ru/output${TIME}/verified_segments/ru_segments.txt && \
-            rm -rf /home/TestData/ctc_segmentation/ru/output${TIME}
-        - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
-          if: "failure()"
-
+      uses: ./.github/workflows/_test_template.yml
+      with:
+        RUNNER: self-hosted-azure
+        SCRIPT: |
+          cd tools/ctc_segmentation && \
+          TIME=`date +"%Y-%m-%d-%T"` && \
+          /bin/bash run_segmentation.sh \
+            --MODEL_NAME_OR_PATH=/home/TestData/ctc_segmentation/QuartzNet15x5-Ru-e512-wer14.45.nemo \
+            --DATA_DIR=/home/TestData/ctc_segmentation/ru \
+            --OUTPUT_DIR=/home/TestData/ctc_segmentation/ru/output${TIME} \
+            --LANGUAGE=ru \
+            --ADDITIONAL_SPLIT_SYMBOLS=";" && \
+          python /home/TestData/ctc_segmentation/verify_alignment.py \
+            -r /home/TestData/ctc_segmentation/ru/valid_ru_segments_1.7.txt \
+            -g /home/TestData/ctc_segmentation/ru/output${TIME}/verified_segments/ru_segments.txt;
+
+          rm -rf /home/TestData/ctc_segmentation/ru/output${TIME}

    # L2: G2P Models
    L2_G2P_Models_G2P_Conformer_training_evaluation_and_inference:
      needs: [cicd-test-container-setup]
-        runs-on: self-hosted-azure-gpus-1
-        container:
-          image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
-          options:
-            # --user 0:128
-            --device=/dev/nvidia0
-            --gpus all
-            --shm-size=8g
-            --env TRANSFORMERS_OFFLINE=0
-            --env HYDRA_FULL_ERROR=1
-            --volume /mnt/datadrive/TestData:/home/TestData
-        steps:
-        - name: Checkout repository
-          uses: actions/checkout@v4
-        - run: |
-            cd examples/tts/g2p && \
-            TIME=`date +"%Y-%m-%d-%T"` && OUTPUT_DIR_CONFORMER=output_ctc_${TIME} && \
-            python g2p_train_and_evaluate.py \
-            train_manifest=/home/TestData/g2p/g2p.json \
-            validation_manifest=/home/TestData/g2p/g2p.json \
-            model.test_ds.manifest_filepath=/home/TestData/g2p/g2p.json \
-            model.tokenizer.dir=/home/TestData/g2p/tokenizer_spe_unigram_v512 \
-            trainer.max_epochs=1 \
-            model.max_source_len=64 \
-            trainer.devices=1 \
-            do_training=True \
-            do_testing=True \
-            exp_manager.exp_dir=${OUTPUT_DIR_CONFORMER} \
-            +exp_manager.use_datetime_version=False\
-            +exp_manager.version=test \
-            --config-name=g2p_conformer_ctc && \
-            python g2p_inference.py \
-            pretrained_model=${OUTPUT_DIR_CONFORMER}/G2P-Conformer-CTC/test/checkpoints/G2P-Conformer-CTC.nemo \
-            manifest_filepath=/home/TestData/g2p/g2p.json \
-            phoneme_field=text
-        - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
-          if: "failure()"
+      uses: ./.github/workflows/_test_template.yml
+      with:
+        RUNNER: self-hosted-azure
+        SCRIPT: |
+          cd examples/tts/g2p && \
+          TIME=`date +"%Y-%m-%d-%T"` && OUTPUT_DIR_CONFORMER=output_ctc_${TIME} && \
+          python g2p_train_and_evaluate.py \
+            train_manifest=/home/TestData/g2p/g2p.json \
+            validation_manifest=/home/TestData/g2p/g2p.json \
+            model.test_ds.manifest_filepath=/home/TestData/g2p/g2p.json \
+            
model.tokenizer.dir=/home/TestData/g2p/tokenizer_spe_unigram_v512 \ + trainer.max_epochs=1 \ + model.max_source_len=64 \ + trainer.devices=1 \ + do_training=True \ + do_testing=True \ + exp_manager.exp_dir=${OUTPUT_DIR_CONFORMER} \ + +exp_manager.use_datetime_version=False\ + +exp_manager.version=test \ + --config-name=g2p_conformer_ctc && \ + python g2p_inference.py \ + pretrained_model=${OUTPUT_DIR_CONFORMER}/G2P-Conformer-CTC/test/checkpoints/G2P-Conformer-CTC.nemo \ + manifest_filepath=/home/TestData/g2p/g2p.json \ + phoneme_field=text # TODO: pleasefixme @redoctopus # - name: ByT5G2P training, evaluation and inference @@ -1175,42 +848,28 @@ jobs: L2_G2P_Models_HeteronymClassificationModel_training_evaluation_and_inference: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/tts/g2p && \ - TIME=`date +"%Y-%m-%d-%T"` && OUTPUT_DIR=output_${TIME} && \ - python g2p_heteronym_classification_train_and_evaluate.py \ - train_manifest=/home/TestData/g2p/manifest.json \ - validation_manifest=/home/TestData/g2p/manifest.json \ - test_manifest=/home/TestData/g2p/manifest.json \ - model.wordids=/home/TestData/g2p/wordids.tsv \ - trainer.max_epochs=1 \ - model.max_seq_length=64 \ - do_training=True \ - do_testing=True \ - exp_manager.exp_dir=${OUTPUT_DIR} \ - +exp_manager.use_datetime_version=False\ - +exp_manager.version=test && \ - python g2p_heteronym_classification_inference.py \ - manifest=/home/TestData/g2p/manifest.json \ - pretrained_model=${OUTPUT_DIR}/HeteronymClassification/test/checkpoints/HeteronymClassification.nemo \ - output_manifest=preds.json - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" - + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + cd examples/tts/g2p && \ + TIME=`date +"%Y-%m-%d-%T"` && OUTPUT_DIR=output_${TIME} && \ + python g2p_heteronym_classification_train_and_evaluate.py \ + train_manifest=/home/TestData/g2p/manifest.json \ + validation_manifest=/home/TestData/g2p/manifest.json \ + test_manifest=/home/TestData/g2p/manifest.json \ + model.wordids=/home/TestData/g2p/wordids.tsv \ + trainer.max_epochs=1 \ + model.max_seq_length=64 \ + do_training=True \ + do_testing=True \ + exp_manager.exp_dir=${OUTPUT_DIR} \ + +exp_manager.use_datetime_version=False\ + +exp_manager.version=test && \ + python g2p_heteronym_classification_inference.py \ + manifest=/home/TestData/g2p/manifest.json \ + pretrained_model=${OUTPUT_DIR}/HeteronymClassification/test/checkpoints/HeteronymClassification.nemo \ + output_manifest=preds.json # L2: Dialogue Classification @@ -1258,320 +917,217 @@ jobs: L2_Dialogue_Classification_Intent_and_slot_classification_using_SGDQA: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/dialogue && \ - python dialogue.py \ - 
model.dataset.data_dir=/home/TestData/nlp/sgd_small \ - model.dataset.dialogues_example_dir=sgd_gen_bert_outputs \ - model.dataset.task_name=debug_sample \ - trainer.max_steps=1 \ - trainer.max_epochs=1 \ - model.train_ds.batch_size=2 \ - model.validation_ds.batch_size=2 \ - model.test_ds.batch_size=2 \ - model.dataset.num_tasks=6 \ - model.nemo_path=null \ - trainer.val_check_interval=0.0 \ - trainer.devices=1 \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name=bert-base-cased \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf sgd_gen_bert_outputs - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + cd examples/nlp/dialogue && \ + python dialogue.py \ + model.dataset.data_dir=/home/TestData/nlp/sgd_small \ + model.dataset.dialogues_example_dir=sgd_gen_bert_outputs \ + model.dataset.task_name=debug_sample \ + trainer.max_steps=1 \ + trainer.max_epochs=1 \ + model.train_ds.batch_size=2 \ + model.validation_ds.batch_size=2 \ + model.test_ds.batch_size=2 \ + model.dataset.num_tasks=6 \ + model.nemo_path=null \ + trainer.val_check_interval=0.0 \ + trainer.devices=1 \ + model.dataset.use_cache=false \ + model.language_model.pretrained_model_name=bert-base-cased \ + trainer.accelerator=gpu \ + exp_manager=null && \ + rm -rf sgd_gen_bert_outputs L2_Dialogue_Classification_Intent_and_slot_classification_using_IntentSlotClassificationModel: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/dialogue && \ - python dialogue.py \ - model.dataset.data_dir=/home/TestData/nlp/processed_assistant \ - model.dataset.dialogues_example_dir=sgd_gen_bert_intent_classification_outputs \ - model.dataset.task=assistant \ - trainer.max_steps=1 \ - trainer.max_epochs=1 \ - model.train_ds.batch_size=2 \ - model.validation_ds.batch_size=2 \ - model.test_ds.batch_size=2 \ - model.nemo_path=null \ - trainer.val_check_interval=0.0 \ - trainer.devices=1 \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name=bert-base-uncased \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf sgd_gen_bert_intent_classification_outputs - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + cd examples/nlp/dialogue && \ + python dialogue.py \ + model.dataset.data_dir=/home/TestData/nlp/processed_assistant \ + model.dataset.dialogues_example_dir=sgd_gen_bert_intent_classification_outputs \ + model.dataset.task=assistant \ + trainer.max_steps=1 \ + trainer.max_epochs=1 \ + model.train_ds.batch_size=2 \ + model.validation_ds.batch_size=2 \ + model.test_ds.batch_size=2 \ + model.nemo_path=null \ + trainer.val_check_interval=0.0 \ + trainer.devices=1 \ + model.dataset.use_cache=false \ + model.language_model.pretrained_model_name=bert-base-uncased \ + trainer.accelerator=gpu \ + exp_manager=null && \ + rm -rf sgd_gen_bert_intent_classification_outputs L2_Dialogue_Classification_Intent_classification_using_ZeroShotIntentModel: needs: 
[cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/dialogue && \ - python dialogue.py \ - do_training=False \ - model.dataset.data_dir=/home/TestData/nlp/drive_thru_revised \ - model.original_nemo_checkpoint=/home/TestData/nlp/drive_thru_revised/zeroshotintent_en_bert_base_uncased.nemo \ - model.dataset.dialogues_example_dir=sgd_gen_zero_shot_intent_classification_outputs \ - model.dataset.task=zero_shot \ - model.dataset.prompt_template="This example is" \ - trainer.max_steps=1 \ - trainer.max_epochs=1 \ - model.train_ds.batch_size=2 \ - model.validation_ds.batch_size=2 \ - model.test_ds.batch_size=2 \ - model.nemo_path=null \ - trainer.val_check_interval=0.0 \ - trainer.devices=1 \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name=bert-base-uncased \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf sgd_gen_zero_shot_intent_classification_outputs - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + cd examples/nlp/dialogue && \ + python dialogue.py \ + do_training=False \ + model.dataset.data_dir=/home/TestData/nlp/drive_thru_revised \ + model.original_nemo_checkpoint=/home/TestData/nlp/drive_thru_revised/zeroshotintent_en_bert_base_uncased.nemo \ + model.dataset.dialogues_example_dir=sgd_gen_zero_shot_intent_classification_outputs \ + model.dataset.task=zero_shot \ + model.dataset.prompt_template="This example is" \ + trainer.max_steps=1 \ + trainer.max_epochs=1 \ + model.train_ds.batch_size=2 \ + model.validation_ds.batch_size=2 \ + model.test_ds.batch_size=2 \ + model.nemo_path=null \ + trainer.val_check_interval=0.0 \ + trainer.devices=1 \ + model.dataset.use_cache=false \ + model.language_model.pretrained_model_name=bert-base-uncased \ + trainer.accelerator=gpu \ + exp_manager=null && \ + rm -rf sgd_gen_zero_shot_intent_classification_outputs L2_Dialogue_Classification_Design_Intent_classification_using_ZeroShotIntentModel: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/dialogue && \ - python dialogue.py \ - do_training=False \ - model.dataset.data_dir=/home/TestData/nlp/design_dataset \ - model.original_nemo_checkpoint=/home/TestData/nlp/drive_thru_revised/zeroshotintent_en_bert_base_uncased.nemo \ - model.dataset.dialogues_example_dir=design_zero_shot_intent_classification_outputs \ - model.dataset.task=design \ - model.dataset.prompt_template="This example is related to" \ - model.library=megatron \ - trainer.max_steps=1 \ - trainer.max_epochs=1 \ - model.train_ds.batch_size=2 \ - model.validation_ds.batch_size=2 \ - model.test_ds.batch_size=2 \ - model.nemo_path=null \ - trainer.val_check_interval=0.0 \ - trainer.devices=1 \ - model.dataset.use_cache=false \ - 
model.language_model.pretrained_model_name=bert-base-uncased \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf design_zero_shot_intent_classification_outputs - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + cd examples/nlp/dialogue && \ + python dialogue.py \ + do_training=False \ + model.dataset.data_dir=/home/TestData/nlp/design_dataset \ + model.original_nemo_checkpoint=/home/TestData/nlp/drive_thru_revised/zeroshotintent_en_bert_base_uncased.nemo \ + model.dataset.dialogues_example_dir=design_zero_shot_intent_classification_outputs \ + model.dataset.task=design \ + model.dataset.prompt_template="This example is related to" \ + model.library=megatron \ + trainer.max_steps=1 \ + trainer.max_epochs=1 \ + model.train_ds.batch_size=2 \ + model.validation_ds.batch_size=2 \ + model.test_ds.batch_size=2 \ + model.nemo_path=null \ + trainer.val_check_interval=0.0 \ + trainer.devices=1 \ + model.dataset.use_cache=false \ + model.language_model.pretrained_model_name=bert-base-uncased \ + trainer.accelerator=gpu \ + exp_manager=null && \ + rm -rf design_zero_shot_intent_classification_outputs L2_Dialogue_Classification_Design_Intent_classification_using_ZeroShotIntentModel_BART_Classifier: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/dialogue && \ - python dialogue.py \ - do_training=False \ - model.dataset.data_dir=/home/TestData/nlp/design_dataset \ - model.original_nemo_checkpoint=/home/TestData/nlp/drive_thru_revised/zeroshotintent_en_bert_base_uncased.nemo \ - model.dataset.dialogues_example_dir=design_zero_shot_intent_classification_bart_outputs \ - model.dataset.task=design \ - model.dataset.prompt_template="This example is related to" \ - model.library=huggingface \ - trainer.devices=1 \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name=bert-base-uncased \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf design_zero_shot_intent_classification_bart_outputs - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + cd examples/nlp/dialogue && \ + python dialogue.py \ + do_training=False \ + model.dataset.data_dir=/home/TestData/nlp/design_dataset \ + model.original_nemo_checkpoint=/home/TestData/nlp/drive_thru_revised/zeroshotintent_en_bert_base_uncased.nemo \ + model.dataset.dialogues_example_dir=design_zero_shot_intent_classification_bart_outputs \ + model.dataset.task=design \ + model.dataset.prompt_template="This example is related to" \ + model.library=huggingface \ + trainer.devices=1 \ + model.dataset.use_cache=false \ + model.language_model.pretrained_model_name=bert-base-uncased \ + trainer.accelerator=gpu \ + exp_manager=null && \ + rm -rf design_zero_shot_intent_classification_bart_outputs L2_Dialogue_Classification_Design_Intent_classification_using_DialogueNearestNeighbourModel: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: 
nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/dialogue && \ - python dialogue.py \ - do_training=False \ - model.dataset.data_dir=/home/TestData/nlp/design_dataset \ - model.dataset.dialogues_example_dir=design_dialogue_nearest_neighbour_classification_outputs \ - model.dataset.task=design \ - model.dataset.prompt_template="" \ - model.library=huggingface \ - trainer.devices=1 \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name=sentence-transformers/all-MiniLM-L6-v2 \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf design_dialogue_nearest_neighbour_classification_outputs - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + cd examples/nlp/dialogue && \ + python dialogue.py \ + do_training=False \ + model.dataset.data_dir=/home/TestData/nlp/design_dataset \ + model.dataset.dialogues_example_dir=design_dialogue_nearest_neighbour_classification_outputs \ + model.dataset.task=design \ + model.dataset.prompt_template="" \ + model.library=huggingface \ + trainer.devices=1 \ + model.dataset.use_cache=false \ + model.language_model.pretrained_model_name=sentence-transformers/all-MiniLM-L6-v2 \ + trainer.accelerator=gpu \ + exp_manager=null && \ + rm -rf design_dialogue_nearest_neighbour_classification_outputs # L2: Dialogue Generation L2_Dialogue_Generation_Dialogue_Answer_Extender_using_DialogueS2SGenerationModel: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/dialogue && \ - python dialogue.py \ - do_training=False \ - model.dataset.data_dir=/home/TestData/nlp/ms-marco-qa \ - model.dataset.dialogues_example_dir=answer_extender_s2s \ - model.dataset.task=ms_marco \ - model.library=huggingface \ - model.dataset.debug_mode=True \ - trainer.max_steps=1 \ - trainer.max_epochs=1 \ - model.train_ds.batch_size=2 \ - model.validation_ds.batch_size=2 \ - model.test_ds.batch_size=2 \ - model.nemo_path=null \ - trainer.val_check_interval=0.0 \ - trainer.devices=1 \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name=facebook/bart-large \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf answer_extender_s2s - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + cd examples/nlp/dialogue && \ + python dialogue.py \ + do_training=False \ + model.dataset.data_dir=/home/TestData/nlp/ms-marco-qa \ + model.dataset.dialogues_example_dir=answer_extender_s2s \ + model.dataset.task=ms_marco \ + model.library=huggingface \ + model.dataset.debug_mode=True \ + trainer.max_steps=1 \ + trainer.max_epochs=1 \ + model.train_ds.batch_size=2 \ + model.validation_ds.batch_size=2 \ + model.test_ds.batch_size=2 \ + 
model.nemo_path=null \ + trainer.val_check_interval=0.0 \ + trainer.devices=1 \ + model.dataset.use_cache=false \ + model.language_model.pretrained_model_name=facebook/bart-large \ + trainer.accelerator=gpu \ + exp_manager=null && \ + rm -rf answer_extender_s2s L2_Dialogue_Generation_Dialogue_SGD_Based_Answer_Extender_using_DialogueS2SGenerationModel: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/dialogue && \ - python dialogue.py \ - do_training=False \ - model.dataset.data_dir=/home/TestData/nlp/sgd_small \ - model.dataset.dialogues_example_dir=sgd_answer_extender_s2s \ - model.dataset.task_name=debug_sample \ - model.dataset.task=sgd_generation \ - model.dataset.input_field=utterance+system_actions \ - model.dataset.output_field=system_utterance \ - model.dataset.use_cache=false \ - model.dataset.system_utterance=next_turn \ - model.dataset.debug_mode=True \ - model.dataset.prompt_template=slots_values \ - model.library=huggingface \ - trainer.max_steps=1 \ - trainer.max_epochs=1 \ - model.train_ds.batch_size=2 \ - model.validation_ds.batch_size=2 \ - model.test_ds.batch_size=2 \ - model.nemo_path=null \ - trainer.val_check_interval=0.0 \ - trainer.devices=1 \ - model.language_model.pretrained_model_name=facebook/bart-large \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf sgd_answer_extender_s2s - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + cd examples/nlp/dialogue && \ + python dialogue.py \ + do_training=False \ + model.dataset.data_dir=/home/TestData/nlp/sgd_small \ + model.dataset.dialogues_example_dir=sgd_answer_extender_s2s \ + model.dataset.task_name=debug_sample \ + model.dataset.task=sgd_generation \ + model.dataset.input_field=utterance+system_actions \ + model.dataset.output_field=system_utterance \ + model.dataset.use_cache=false \ + model.dataset.system_utterance=next_turn \ + model.dataset.debug_mode=True \ + model.dataset.prompt_template=slots_values \ + model.library=huggingface \ + trainer.max_steps=1 \ + trainer.max_epochs=1 \ + model.train_ds.batch_size=2 \ + model.validation_ds.batch_size=2 \ + model.test_ds.batch_size=2 \ + model.nemo_path=null \ + trainer.val_check_interval=0.0 \ + trainer.devices=1 \ + model.language_model.pretrained_model_name=facebook/bart-large \ + trainer.accelerator=gpu \ + exp_manager=null + AFTER_SCRIPT: | + rm -rf sgd_answer_extender_s2s # - name: L2: Dialogue Generation Part 2 # when { @@ -1607,80 +1163,54 @@ jobs: # L2: COPY L2_COPY_Dialogue_Answer_Extender_using_DialogueGPTGenerationModel: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/dialogue && \ - python dialogue.py \ - do_training=False \ - 
model.dataset.data_dir=/home/TestData/nlp/ms-marco-qa \ - model.dataset.dialogues_example_dir=answer_extender \ - model.library=huggingface \ - model.dataset.task=ms_marco \ - model.dataset.debug_mode=True \ - trainer.val_check_interval=0.0 \ - trainer.devices=1 \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name=gpt2 \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf answer_extender - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + cd examples/nlp/dialogue && \ + python dialogue.py \ + do_training=False \ + model.dataset.data_dir=/home/TestData/nlp/ms-marco-qa \ + model.dataset.dialogues_example_dir=answer_extender \ + model.library=huggingface \ + model.dataset.task=ms_marco \ + model.dataset.debug_mode=True \ + trainer.val_check_interval=0.0 \ + trainer.devices=1 \ + model.dataset.use_cache=false \ + model.language_model.pretrained_model_name=gpt2 \ + trainer.accelerator=gpu \ + exp_manager=null && \ + rm -rf answer_extender # L2: Duplex Text Normalization L2_Duplex_Text_Normalization_with_Tarred_dataset: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/duplex_text_normalization && \ - python duplex_text_normalization_train.py \ - data.validation_ds.data_path=/home/TestData/nlp/duplex_text_norm/small_test.tsv \ - mode=tn \ - lang=en \ - tagger_model.do_training=false \ - decoder_model.transformer=t5-small \ - data.validation_ds.batch_size=2 \ - data.train_ds.use_cache=false \ - data.validation_ds.use_cache=false \ - data.test_ds.batch_size=2 \ - data.train_ds.decoder_data_augmentation=false \ - data.train_ds.num_workers=2 \ - decoder_trainer.devices=[0,1] \ - decoder_trainer.accelerator="gpu" \ - data.train_ds.use_tarred_dataset=true \ - +decoder_trainer.fast_dev_run=true \ - decoder_exp_manager.create_checkpoint_callback=false \ - data.train_ds.tar_metadata_file=/home/TestData/nlp/duplex_text_norm/tarred_small/metadata.json \ - data.test_ds.use_cache=false \ - data.test_ds.data_path=/home/TestData/nlp/duplex_text_norm/small_test.tsv - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + cd examples/nlp/duplex_text_normalization && \ + python duplex_text_normalization_train.py \ + data.validation_ds.data_path=/home/TestData/nlp/duplex_text_norm/small_test.tsv \ + mode=tn \ + lang=en \ + tagger_model.do_training=false \ + decoder_model.transformer=t5-small \ + data.validation_ds.batch_size=2 \ + data.train_ds.use_cache=false \ + data.validation_ds.use_cache=false \ + data.test_ds.batch_size=2 \ + data.train_ds.decoder_data_augmentation=false \ + data.train_ds.num_workers=2 \ + decoder_trainer.devices=[0,1] \ + decoder_trainer.accelerator="gpu" \ + data.train_ds.use_tarred_dataset=true \ + +decoder_trainer.fast_dev_run=true \ + decoder_exp_manager.create_checkpoint_callback=false \ + data.train_ds.tar_metadata_file=/home/TestData/nlp/duplex_text_norm/tarred_small/metadata.json \ + data.test_ds.use_cache=false \ + 
data.test_ds.data_path=/home/TestData/nlp/duplex_text_norm/small_test.tsv # Runs out of memory on the 12G TITAN V (GPU 0 on main CI) # TODO: add when megatron bert is supported again in NeMo @@ -1713,336 +1243,221 @@ jobs: # L2: BERT Text Classification L2_BERT_Text_Classification_with_BERT_Test: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/text_classification && \ - python text_classification_with_bert.py \ - model.dataset.num_classes=6 \ - model.train_ds.file_path=/home/TestData/nlp/retail_text_classification/train.tsv \ - model.validation_ds.file_path=/home/TestData/nlp/retail_text_classification/dev.tsv \ - model.language_model.pretrained_model_name=distilbert-base-uncased \ - model.train_ds.batch_size=10 \ - model.dataset.max_seq_length=50 \ - model.dataset.use_cache=false \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - exp_manager=null - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + cd examples/nlp/text_classification && \ + python text_classification_with_bert.py \ + model.dataset.num_classes=6 \ + model.train_ds.file_path=/home/TestData/nlp/retail_text_classification/train.tsv \ + model.validation_ds.file_path=/home/TestData/nlp/retail_text_classification/dev.tsv \ + model.language_model.pretrained_model_name=distilbert-base-uncased \ + model.train_ds.batch_size=10 \ + model.dataset.max_seq_length=50 \ + model.dataset.use_cache=false \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + +trainer.fast_dev_run=true \ + exp_manager=null # L2: Parallel BERT Question-Answering SQUAD v1.1 & v2.0 L2_Parallel_BERT_Question-Answering_SQUAD_v1_1: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - # Cannot do fast_dev_run because squad needs whole dev dataset - cd examples/nlp/question_answering && \ - python question_answering.py \ - model.train_ds.file=/home/TestData/nlp/squad_mini/v1.1/train-v1.1.json \ - model.dataset.use_cache=false \ - model.validation_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \ - model.test_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \ - model.train_ds.batch_size=2 \ - model.train_ds.num_samples=2 \ - model.validation_ds.batch_size=2 \ - model.validation_ds.num_samples=2 \ - model.test_ds.num_samples=2 \ - model.test_ds.batch_size=2 \ - trainer.max_epochs=1 \ - trainer.max_steps=1 \ - model.language_model.pretrained_model_name=bert-base-uncased \ - model.dataset.version_2_with_negative=false \ - trainer.precision=16 \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - exp_manager=null - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: 
self-hosted-azure-gpus-1 + SCRIPT: | + # Cannot do fast_dev_run because squad needs whole dev dataset + cd examples/nlp/question_answering && \ + python question_answering.py \ + model.train_ds.file=/home/TestData/nlp/squad_mini/v1.1/train-v1.1.json \ + model.dataset.use_cache=false \ + model.validation_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \ + model.test_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \ + model.train_ds.batch_size=2 \ + model.train_ds.num_samples=2 \ + model.validation_ds.batch_size=2 \ + model.validation_ds.num_samples=2 \ + model.test_ds.num_samples=2 \ + model.test_ds.batch_size=2 \ + trainer.max_epochs=1 \ + trainer.max_steps=1 \ + model.language_model.pretrained_model_name=bert-base-uncased \ + model.dataset.version_2_with_negative=false \ + trainer.precision=16 \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + exp_manager=null L2_Parallel_BERT_Question-Answering_SQUAD_v2_0: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - # Cannot do fast_dev_run because squad needs whole dev dataset - cd examples/nlp/question_answering && \ - python question_answering.py \ - model.train_ds.file=/home/TestData/nlp/squad_mini/v2.0/train-v2.0.json \ - model.dataset.use_cache=false \ - model.train_ds.batch_size=2 \ - model.train_ds.num_samples=2 \ - model.validation_ds.batch_size=2 \ - model.validation_ds.num_samples=2 \ - trainer.max_epochs=1 \ - trainer.max_steps=1 \ - model.validation_ds.file=/home/TestData/nlp/squad_mini/v2.0/dev-v2.0.json \ - model.language_model.pretrained_model_name=bert-base-uncased \ - model.dataset.version_2_with_negative=true \ - trainer.precision=16 \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - exp_manager=null - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + # Cannot do fast_dev_run because squad needs whole dev dataset + cd examples/nlp/question_answering && \ + python question_answering.py \ + model.train_ds.file=/home/TestData/nlp/squad_mini/v2.0/train-v2.0.json \ + model.dataset.use_cache=false \ + model.train_ds.batch_size=2 \ + model.train_ds.num_samples=2 \ + model.validation_ds.batch_size=2 \ + model.validation_ds.num_samples=2 \ + trainer.max_epochs=1 \ + trainer.max_steps=1 \ + model.validation_ds.file=/home/TestData/nlp/squad_mini/v2.0/dev-v2.0.json \ + model.language_model.pretrained_model_name=bert-base-uncased \ + model.dataset.version_2_with_negative=true \ + trainer.precision=16 \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + exp_manager=null # L2: Parallel BART Question-Answering SQUAD v1.1 & v2.0 L2_Parallel_BART_Question-Answering_SQUAD_v1_1: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/question_answering && \ - 
python question_answering.py \ - model.train_ds.file=/home/TestData/nlp/squad_mini/v1.1/train-v1.1.json \ - model.dataset.use_cache=false \ - model.dataset.check_if_answer_in_context=false \ - model.validation_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \ - model.test_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \ - model.train_ds.batch_size=2 \ - model.train_ds.num_samples=2 \ - model.validation_ds.batch_size=2 \ - model.validation_ds.num_samples=2 \ - model.test_ds.num_samples=2 \ - model.test_ds.batch_size=2 \ - trainer.max_epochs=1 \ - trainer.max_steps=1 \ - model.language_model.pretrained_model_name=facebook/bart-base \ - model.dataset.version_2_with_negative=false \ - trainer.precision=16 \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - exp_manager=null - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + cd examples/nlp/question_answering && \ + python question_answering.py \ + model.train_ds.file=/home/TestData/nlp/squad_mini/v1.1/train-v1.1.json \ + model.dataset.use_cache=false \ + model.dataset.check_if_answer_in_context=false \ + model.validation_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \ + model.test_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \ + model.train_ds.batch_size=2 \ + model.train_ds.num_samples=2 \ + model.validation_ds.batch_size=2 \ + model.validation_ds.num_samples=2 \ + model.test_ds.num_samples=2 \ + model.test_ds.batch_size=2 \ + trainer.max_epochs=1 \ + trainer.max_steps=1 \ + model.language_model.pretrained_model_name=facebook/bart-base \ + model.dataset.version_2_with_negative=false \ + trainer.precision=16 \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + exp_manager=null L2_Parallel_BART_Question-Answering_SQUAD_v2_0: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/question_answering && \ - python question_answering.py \ - model.train_ds.file=/home/TestData/nlp/squad_mini/v2.0/train-v2.0.json \ - model.dataset.use_cache=false \ - model.dataset.check_if_answer_in_context=false \ - model.train_ds.batch_size=2 \ - model.train_ds.num_samples=2 \ - model.validation_ds.batch_size=2 \ - model.validation_ds.num_samples=2 \ - trainer.max_epochs=1 \ - trainer.max_steps=1 \ - model.validation_ds.file=/home/TestData/nlp/squad_mini/v2.0/dev-v2.0.json \ - model.language_model.pretrained_model_name=facebook/bart-base \ - model.dataset.version_2_with_negative=true \ - trainer.precision=16 \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - exp_manager=null - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + cd examples/nlp/question_answering && \ + python question_answering.py \ + model.train_ds.file=/home/TestData/nlp/squad_mini/v2.0/train-v2.0.json \ + model.dataset.use_cache=false \ + model.dataset.check_if_answer_in_context=false \ + model.train_ds.batch_size=2 \ + model.train_ds.num_samples=2 \ + model.validation_ds.batch_size=2 \ + 
model.validation_ds.num_samples=2 \ + trainer.max_epochs=1 \ + trainer.max_steps=1 \ + model.validation_ds.file=/home/TestData/nlp/squad_mini/v2.0/dev-v2.0.json \ + model.language_model.pretrained_model_name=facebook/bart-base \ + model.dataset.version_2_with_negative=true \ + trainer.precision=16 \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + exp_manager=null # L2: Parallel GPT2 Question-Answering SQUAD v1.1 & v2.0 L2_Parallel_GPT2_Question-Answering_SQUAD_v1_1: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/question_answering && \ - python question_answering.py \ - model.train_ds.file=/home/TestData/nlp/squad_mini/v1.1/train-v1.1.json \ - model.dataset.use_cache=false \ - model.dataset.check_if_answer_in_context=false \ - model.validation_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \ - model.test_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \ - model.train_ds.batch_size=2 \ - model.train_ds.num_samples=2 \ - model.validation_ds.batch_size=2 \ - model.validation_ds.num_samples=2 \ - model.test_ds.num_samples=2 \ - model.test_ds.batch_size=2 \ - trainer.max_epochs=1 \ - trainer.max_steps=1 \ - model.language_model.pretrained_model_name=gpt2 \ - model.dataset.version_2_with_negative=false \ - trainer.precision=16 \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - exp_manager=null - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + cd examples/nlp/question_answering && \ + python question_answering.py \ + model.train_ds.file=/home/TestData/nlp/squad_mini/v1.1/train-v1.1.json \ + model.dataset.use_cache=false \ + model.dataset.check_if_answer_in_context=false \ + model.validation_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \ + model.test_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \ + model.train_ds.batch_size=2 \ + model.train_ds.num_samples=2 \ + model.validation_ds.batch_size=2 \ + model.validation_ds.num_samples=2 \ + model.test_ds.num_samples=2 \ + model.test_ds.batch_size=2 \ + trainer.max_epochs=1 \ + trainer.max_steps=1 \ + model.language_model.pretrained_model_name=gpt2 \ + model.dataset.version_2_with_negative=false \ + trainer.precision=16 \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + exp_manager=null L2_Parallel_GPT2_Question-Answering_SQUAD_v2_0: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/question_answering && \ - python question_answering.py \ - model.train_ds.file=/home/TestData/nlp/squad_mini/v2.0/train-v2.0.json \ - model.dataset.use_cache=false \ - model.dataset.check_if_answer_in_context=false \ - model.train_ds.batch_size=2 \ - model.train_ds.num_samples=2 \ - model.validation_ds.batch_size=2 \ - 
model.validation_ds.num_samples=2 \ - trainer.max_epochs=1 \ - trainer.max_steps=1 \ - model.validation_ds.file=/home/TestData/nlp/squad_mini/v2.0/dev-v2.0.json \ - model.language_model.pretrained_model_name=gpt2 \ - model.dataset.version_2_with_negative=true \ - trainer.precision=16 \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - exp_manager=null - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + cd examples/nlp/question_answering && \ + python question_answering.py \ + model.train_ds.file=/home/TestData/nlp/squad_mini/v2.0/train-v2.0.json \ + model.dataset.use_cache=false \ + model.dataset.check_if_answer_in_context=false \ + model.train_ds.batch_size=2 \ + model.train_ds.num_samples=2 \ + model.validation_ds.batch_size=2 \ + model.validation_ds.num_samples=2 \ + trainer.max_epochs=1 \ + trainer.max_steps=1 \ + model.validation_ds.file=/home/TestData/nlp/squad_mini/v2.0/dev-v2.0.json \ + model.language_model.pretrained_model_name=gpt2 \ + model.dataset.version_2_with_negative=true \ + trainer.precision=16 \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + exp_manager=null # L2: Intent and Slot Classification Tasks L2_Intent_and_Slot_Classification_Tasks_Intent_and_Slot_Classification: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/intent_slot_classification && \ - python intent_slot_classification.py \ - model.data_dir=/home/TestData/nlp/retail \ - model.validation_ds.prefix=dev \ - model.test_ds.prefix=dev \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - exp_manager.exp_dir=checkpoints - rm -rf checkpoints - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + cd examples/nlp/intent_slot_classification && \ + python intent_slot_classification.py \ + model.data_dir=/home/TestData/nlp/retail \ + model.validation_ds.prefix=dev \ + model.test_ds.prefix=dev \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + +trainer.fast_dev_run=true \ + exp_manager.exp_dir=checkpoints + AFTER_SCRIPT: | + rm -rf checkpoints L2_Intent_and_Slot_Classification_Tasks_Multi-Label_Intent_and_Slot_Classification: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/intent_slot_classification && \ - python multi_label_intent_slot_classification.py \ - model.data_dir=/home/TestData/nlp/new_multiatis \ - model.validation_ds.prefix=dev \ - model.test_ds.prefix=dev \ - trainer.devices=1 \ - +trainer.fast_dev_run=true \ - exp_manager.exp_dir=checkpoints2 - rm -rf checkpoints2 - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: 
./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + cd examples/nlp/intent_slot_classification && \ + python multi_label_intent_slot_classification.py \ + model.data_dir=/home/TestData/nlp/new_multiatis \ + model.validation_ds.prefix=dev \ + model.test_ds.prefix=dev \ + trainer.devices=1 \ + +trainer.fast_dev_run=true \ + exp_manager.exp_dir=checkpoints2 + AFTER_SCRIPT: | + rm -rf checkpoints2 # TODO: add when megatron-bert is supported again # stage('L2: Model Parallel Size 2 Megatron Text Classification') { @@ -2153,342 +1568,246 @@ jobs: # L2: Parallel NLP Examples 2 L2_Parallel_NLP_Examples2_NER_finetuning_from_pretrained_Test: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/token_classification && \ - python token_classification_train.py \ - pretrained_model=ner_en_bert \ - model.dataset.data_dir=/home/TestData/nlp/ner/ \ - model.train_ds.batch_size=2 \ - model.dataset.use_cache=false \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - model.dataset.class_balancing="weighted_loss" \ - exp_manager.exp_dir=null - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + cd examples/nlp/token_classification && \ + python token_classification_train.py \ + pretrained_model=ner_en_bert \ + model.dataset.data_dir=/home/TestData/nlp/ner/ \ + model.train_ds.batch_size=2 \ + model.dataset.use_cache=false \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + +trainer.fast_dev_run=true \ + model.dataset.class_balancing="weighted_loss" \ + exp_manager.exp_dir=null L2_Parallel_NLP_Examples2_Punctuation_and_capitalization_finetuning_from_pretrained_test: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/token_classification && \ - data_dir="$(mktemp -d -p "$(pwd)")" && \ - cp /home/TestData/nlp/token_classification_punctuation/*.txt "${data_dir}"/ && \ - python punctuation_capitalization_train_evaluate.py \ - pretrained_model=punctuation_en_bert \ - model.train_ds.ds_item="${data_dir}" \ - model.validation_ds.ds_item="${data_dir}" \ - model.test_ds.ds_item="${data_dir}" \ - +model.train_ds.use_cache=false \ - +model.validation_ds.use_cache=false \ - +model.test_ds.use_cache=false \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - exp_manager.exp_dir=null && \ - rm -rf "${data_dir}" - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + cd examples/nlp/token_classification && \ + data_dir="$(mktemp -d -p "$(pwd)")" && \ + cp /home/TestData/nlp/token_classification_punctuation/*.txt 
"${data_dir}"/ && \ + python punctuation_capitalization_train_evaluate.py \ + pretrained_model=punctuation_en_bert \ + model.train_ds.ds_item="${data_dir}" \ + model.validation_ds.ds_item="${data_dir}" \ + model.test_ds.ds_item="${data_dir}" \ + +model.train_ds.use_cache=false \ + +model.validation_ds.use_cache=false \ + +model.test_ds.use_cache=false \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + +trainer.fast_dev_run=true \ + exp_manager.exp_dir=null; + + rm -rf "${data_dir}" L2_Parallel_NLP_Examples2_NER_with_TurkuNLP__bert-base-finnish-cased-v1: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/token_classification && \ - python token_classification_train.py \ - model.dataset.data_dir=/home/TestData/nlp/token_classification_punctuation/ \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name="TurkuNLP/bert-base-finnish-cased-v1" \ - exp_manager.exp_dir=null - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" - + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + cd examples/nlp/token_classification && \ + python token_classification_train.py \ + model.dataset.data_dir=/home/TestData/nlp/token_classification_punctuation/ \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + +trainer.fast_dev_run=true \ + model.dataset.use_cache=false \ + model.language_model.pretrained_model_name="TurkuNLP/bert-base-finnish-cased-v1" \ + exp_manager.exp_dir=null + L2_Parallel_NLP_Examples2_Evaluation_script_for_Token_Classification: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/token_classification/token_classification_evaluate.py \ - model.dataset.data_dir=/home/TestData/nlp/ner/ \ - model.dataset.use_cache=false \ - pretrained_model=/home/TestData/nlp/pretrained_models/NER_Model_with_BERT_base_uncased.nemo - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/token_classification/token_classification_evaluate.py \ + model.dataset.data_dir=/home/TestData/nlp/ner/ \ + model.dataset.use_cache=false \ + pretrained_model=/home/TestData/nlp/pretrained_models/NER_Model_with_BERT_base_uncased.nemo L2_Parallel_NLP_Examples2_Evaluation_script_for_Punctuation: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: 
actions/checkout@v4 - - run: | - data_dir="$(mktemp -d -p "$(pwd)")" && \ - cp /home/TestData/nlp/token_classification_punctuation/*.txt "${data_dir}"/ && \ - python examples/nlp/token_classification/punctuation_capitalization_train_evaluate.py \ - +do_training=false \ - +do_testing=true \ - model.test_ds.ds_item="${data_dir}" \ - ~model.train_ds \ - ~model.validation_ds \ - +model.test_ds.use_cache=false \ - pretrained_model=/home/TestData/nlp/pretrained_models/Punctuation_Capitalization_with_DistilBERT_base_uncased.nemo && \ - rm -rf "${data_dir}" - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + data_dir="$(mktemp -d -p "$(pwd)")" && \ + cp /home/TestData/nlp/token_classification_punctuation/*.txt "${data_dir}"/ && \ + python examples/nlp/token_classification/punctuation_capitalization_train_evaluate.py \ + +do_training=false \ + +do_testing=true \ + model.test_ds.ds_item="${data_dir}" \ + ~model.train_ds \ + ~model.validation_ds \ + +model.test_ds.use_cache=false \ + pretrained_model=/home/TestData/nlp/pretrained_models/Punctuation_Capitalization_with_DistilBERT_base_uncased.nemo; + + rm -rf "${data_dir}" + L2_Parallel_NLP_Examples2_Punctuation_Capitalization_2GPUs_with_DistilBERT_Finetuning_on_other_data: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/token_classification && \ - output_dir="$(mktemp -d -p "$(pwd)")" && \ - tmp_data_dir="$(mktemp -d -p "$(pwd)")" && \ - cp /home/TestData/nlp/token_classification_punctuation/*.txt "${tmp_data_dir}"/ && \ - python punctuation_capitalization_train_evaluate.py \ - model.train_ds.use_tarred_dataset=false \ - model.train_ds.ds_item="${tmp_data_dir}" \ - model.validation_ds.ds_item="${tmp_data_dir}" \ - model.test_ds.ds_item="${tmp_data_dir}" \ - model.language_model.pretrained_model_name=distilbert-base-uncased \ - +model.train_ds.use_cache=false \ - +model.validation_ds.use_cache=false \ - +model.test_ds.use_cache=false \ - trainer.devices=[0,1] \ - trainer.accelerator="gpu" \ - trainer.strategy=ddp \ - trainer.max_epochs=1 \ - +exp_manager.explicit_log_dir="${output_dir}" \ - +do_testing=true && \ - tmp_data_dir_2="$(mktemp -d -p "$(pwd)")" && \ - mv "${tmp_data_dir}"/* "${tmp_data_dir_2}" && \ - rm -rf "${tmp_data_dir}" && \ - python punctuation_capitalization_train_evaluate.py \ - model.train_ds.use_tarred_dataset=false \ - model.train_ds.ds_item="${tmp_data_dir_2}" \ - model.validation_ds.ds_item="${tmp_data_dir_2}" \ - model.test_ds.ds_item="${tmp_data_dir_2}" \ - pretrained_model="${output_dir}/checkpoints/Punctuation_and_Capitalization.nemo" \ - +model.train_ds.use_cache=false \ - +model.validation_ds.use_cache=false \ - +model.test_ds.use_cache=false \ - trainer.devices=[0,1] \ - trainer.accelerator="gpu" \ - trainer.strategy=ddp \ - trainer.max_epochs=1 \ - exp_manager=null && \ - rm -rf /workspace/NeMo/examples/nlp/token_classification/nemo_experiments \ - "${tmp_data_dir_2}" \ - "${output_dir}" - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: 
self-hosted-azure + SCRIPT: | + cd examples/nlp/token_classification && \ + output_dir="$(mktemp -d -p "$(pwd)")" && \ + tmp_data_dir="$(mktemp -d -p "$(pwd)")" && \ + cp /home/TestData/nlp/token_classification_punctuation/*.txt "${tmp_data_dir}"/ && \ + python punctuation_capitalization_train_evaluate.py \ + model.train_ds.use_tarred_dataset=false \ + model.train_ds.ds_item="${tmp_data_dir}" \ + model.validation_ds.ds_item="${tmp_data_dir}" \ + model.test_ds.ds_item="${tmp_data_dir}" \ + model.language_model.pretrained_model_name=distilbert-base-uncased \ + +model.train_ds.use_cache=false \ + +model.validation_ds.use_cache=false \ + +model.test_ds.use_cache=false \ + trainer.devices=[0,1] \ + trainer.accelerator="gpu" \ + trainer.strategy=ddp \ + trainer.max_epochs=1 \ + +exp_manager.explicit_log_dir="${output_dir}" \ + +do_testing=true && \ + tmp_data_dir_2="$(mktemp -d -p "$(pwd)")" && \ + mv "${tmp_data_dir}"/* "${tmp_data_dir_2}" && \ + rm -rf "${tmp_data_dir}" && \ + python punctuation_capitalization_train_evaluate.py \ + model.train_ds.use_tarred_dataset=false \ + model.train_ds.ds_item="${tmp_data_dir_2}" \ + model.validation_ds.ds_item="${tmp_data_dir_2}" \ + model.test_ds.ds_item="${tmp_data_dir_2}" \ + pretrained_model="${output_dir}/checkpoints/Punctuation_and_Capitalization.nemo" \ + +model.train_ds.use_cache=false \ + +model.validation_ds.use_cache=false \ + +model.test_ds.use_cache=false \ + trainer.devices=[0,1] \ + trainer.accelerator="gpu" \ + trainer.strategy=ddp \ + trainer.max_epochs=1 \ + exp_manager=null; + + rm -rf /workspace/NeMo/examples/nlp/token_classification/nemo_experiments \ + "${tmp_data_dir_2}" \ + "${output_dir}" # Punctuation & Capitalization tarred dataset: Punctuation_Capitalization_tarred_dataset_create_and_use_tarred_dataset: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - data_dir="$(mktemp -d -p "$(pwd)")" && \ - cp -r /home/TestData/nlp/token_classification_punctuation/*.txt \ - /home/TestData/nlp/token_classification_punctuation/wmt_wiki_10000 \ - "${data_dir}"/ && \ - usual_data=${data_dir}/wmt_wiki_10000 && \ - output_dir="$(mktemp -d -p "$(pwd)")" && \ - tarred_data=${output_dir}/train_tarred && \ - tokens_in_batch=2000 && \ - max_seq_length=512 && \ - lm_model=distilbert-base-uncased && \ - python examples/nlp/token_classification/data/create_punctuation_capitalization_tarred_dataset.py \ - --text ${usual_data}/input.txt \ - --labels ${usual_data}/labels.txt \ - --output_dir ${tarred_data} \ - --tokens_in_batch ${tokens_in_batch} \ - --max_seq_length 512 \ - --lines_per_dataset_fragment 2000 \ - --num_batches_per_tarfile 5 \ - --tar_file_prefix punctuation_capitalization \ - --tokenizer_name ${lm_model} \ - --use_fast_tokenizer \ - --pad_label O \ - --n_jobs 3 && \ - echo "Number of tarred files in dataset:" && \ - ls ${tarred_data}/*.tar | wc -l && \ - echo "Label id files in dataset:" && \ - ls ${tarred_data}/*.csv && \ - metadata_file=${tarred_data}/metadata.punctuation_capitalization.tokens${tokens_in_batch}.max_seq_length${max_seq_length}.${lm_model}.json && \ - python examples/nlp/token_classification/punctuation_capitalization_train_evaluate.py \ - 
model.validation_ds.ds_item="${data_dir}" \ - model.test_ds.ds_item="${data_dir}" \ - model.train_ds.ds_item=${tarred_data} \ - model.language_model.pretrained_model_name=${lm_model} \ - model.train_ds.use_tarred_dataset=true \ - model.train_ds.tar_metadata_file=${metadata_file} \ - +model.train_ds.use_cache=false \ - +model.validation_ds.use_cache=false \ - +model.test_ds.use_cache=false \ - trainer.devices=[0,1] \ - trainer.accelerator="gpu" \ - trainer.strategy=ddp \ - trainer.max_epochs=1 \ - +exp_manager.explicit_log_dir=${output_dir}/output && \ - rm -rf "${output_dir}" "${data_dir}" - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + data_dir="$(mktemp -d -p "$(pwd)")" && \ + cp -r /home/TestData/nlp/token_classification_punctuation/*.txt \ + /home/TestData/nlp/token_classification_punctuation/wmt_wiki_10000 \ + "${data_dir}"/ && \ + usual_data=${data_dir}/wmt_wiki_10000 && \ + output_dir="$(mktemp -d -p "$(pwd)")" && \ + tarred_data=${output_dir}/train_tarred && \ + tokens_in_batch=2000 && \ + max_seq_length=512 && \ + lm_model=distilbert-base-uncased && \ + python examples/nlp/token_classification/data/create_punctuation_capitalization_tarred_dataset.py \ + --text ${usual_data}/input.txt \ + --labels ${usual_data}/labels.txt \ + --output_dir ${tarred_data} \ + --tokens_in_batch ${tokens_in_batch} \ + --max_seq_length 512 \ + --lines_per_dataset_fragment 2000 \ + --num_batches_per_tarfile 5 \ + --tar_file_prefix punctuation_capitalization \ + --tokenizer_name ${lm_model} \ + --use_fast_tokenizer \ + --pad_label O \ + --n_jobs 3 && \ + echo "Number of tarred files in dataset:" && \ + ls ${tarred_data}/*.tar | wc -l && \ + echo "Label id files in dataset:" && \ + ls ${tarred_data}/*.csv && \ + metadata_file=${tarred_data}/metadata.punctuation_capitalization.tokens${tokens_in_batch}.max_seq_length${max_seq_length}.${lm_model}.json && \ + python examples/nlp/token_classification/punctuation_capitalization_train_evaluate.py \ + model.validation_ds.ds_item="${data_dir}" \ + model.test_ds.ds_item="${data_dir}" \ + model.train_ds.ds_item=${tarred_data} \ + model.language_model.pretrained_model_name=${lm_model} \ + model.train_ds.use_tarred_dataset=true \ + model.train_ds.tar_metadata_file=${metadata_file} \ + +model.train_ds.use_cache=false \ + +model.validation_ds.use_cache=false \ + +model.test_ds.use_cache=false \ + trainer.devices=[0,1] \ + trainer.accelerator="gpu" \ + trainer.strategy=ddp \ + trainer.max_epochs=1 \ + +exp_manager.explicit_log_dir=${output_dir}/output; + + rm -rf "${output_dir}" "${data_dir}" # Punctuation_Capitalization_Different_ways_of_passing_labels_to_model Punctuation_Capitalization_Using_model-common_datasets_parameters-label_vocab_dir: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/token_classification && \ - work_dir="$(mktemp -d -p "$(pwd)")" && \ - label_vocab_dir="${work_dir}/labels" && \ - mkdir -p ${label_vocab_dir} && \ - data_dir="${work_dir}/data" && \ - mkdir -p "${data_dir}" && \ - cp /home/TestData/nlp/token_classification_punctuation/*.txt "${data_dir}" 
&& \ - output_dir="${work_dir}/output" && \ - mkdir -p "${output_dir}" && \ - punct_label_vocab="${label_vocab_dir}/punct_label_vocab.csv" && \ - capit_label_vocab="${label_vocab_dir}/capit_label_vocab.csv" && \ - printf "O\n,\n.\n?\n" > "${punct_label_vocab}" && \ - printf "O\nU\n" > "${capit_label_vocab}" && \ - python punctuation_capitalization_train_evaluate.py \ - model.train_ds.use_tarred_dataset=false \ - model.train_ds.ds_item="${data_dir}" \ - model.validation_ds.ds_item="${data_dir}" \ - model.test_ds.ds_item="${data_dir}" \ - model.language_model.pretrained_model_name=distilbert-base-uncased \ - model.common_dataset_parameters.label_vocab_dir="${label_vocab_dir}" \ - model.class_labels.punct_labels_file="$(basename "${punct_label_vocab}")" \ - model.class_labels.capit_labels_file="$(basename "${capit_label_vocab}")" \ - +model.train_ds.use_cache=false \ - +model.validation_ds.use_cache=false \ - +model.test_ds.use_cache=false \ - trainer.devices=[0,1] \ - trainer.strategy=ddp \ - trainer.max_epochs=1 \ - +exp_manager.explicit_log_dir="${output_dir}" \ - +do_testing=false && \ - python punctuation_capitalization_train_evaluate.py \ - +do_training=false \ - +do_testing=true \ - ~model.train_ds \ - ~model.validation_ds \ - model.test_ds.ds_item="${data_dir}" \ - pretrained_model="${output_dir}/checkpoints/Punctuation_and_Capitalization.nemo" \ - +model.train_ds.use_cache=false \ - +model.validation_ds.use_cache=false \ - +model.test_ds.use_cache=false \ - trainer.devices=[0,1] \ - trainer.strategy=ddp \ - trainer.max_epochs=1 \ - exp_manager=null && \ - rm -rf "${work_dir}" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + cd examples/nlp/token_classification && \ + work_dir="$(mktemp -d -p "$(pwd)")" && \ + label_vocab_dir="${work_dir}/labels" && \ + mkdir -p ${label_vocab_dir} && \ + data_dir="${work_dir}/data" && \ + mkdir -p "${data_dir}" && \ + cp /home/TestData/nlp/token_classification_punctuation/*.txt "${data_dir}" && \ + output_dir="${work_dir}/output" && \ + mkdir -p "${output_dir}" && \ + punct_label_vocab="${label_vocab_dir}/punct_label_vocab.csv" && \ + capit_label_vocab="${label_vocab_dir}/capit_label_vocab.csv" && \ + printf "O\n,\n.\n?\n" > "${punct_label_vocab}" && \ + printf "O\nU\n" > "${capit_label_vocab}" && \ + python punctuation_capitalization_train_evaluate.py \ + model.train_ds.use_tarred_dataset=false \ + model.train_ds.ds_item="${data_dir}" \ + model.validation_ds.ds_item="${data_dir}" \ + model.test_ds.ds_item="${data_dir}" \ + model.language_model.pretrained_model_name=distilbert-base-uncased \ + model.common_dataset_parameters.label_vocab_dir="${label_vocab_dir}" \ + model.class_labels.punct_labels_file="$(basename "${punct_label_vocab}")" \ + model.class_labels.capit_labels_file="$(basename "${capit_label_vocab}")" \ + +model.train_ds.use_cache=false \ + +model.validation_ds.use_cache=false \ + +model.test_ds.use_cache=false \ + trainer.devices=[0,1] \ + trainer.strategy=ddp \ + trainer.max_epochs=1 \ + +exp_manager.explicit_log_dir="${output_dir}" \ + +do_testing=false && \ + python punctuation_capitalization_train_evaluate.py \ + +do_training=false \ + +do_testing=true \ + ~model.train_ds \ + ~model.validation_ds \ + model.test_ds.ds_item="${data_dir}" \ + pretrained_model="${output_dir}/checkpoints/Punctuation_and_Capitalization.nemo" \ + +model.train_ds.use_cache=false \ + +model.validation_ds.use_cache=false \ + +model.test_ds.use_cache=false \ + trainer.devices=[0,1] \ + trainer.strategy=ddp \ + 
trainer.max_epochs=1 \ + exp_manager=null && \ + rm -rf "${work_dir}" + # TODO: pleasefixme # Punctuation_Capitalization_Using_model-common_datasets_parameters-punct-capit-_label_ids: # needs: [cicd-test-container-setup] @@ -2555,670 +1874,501 @@ jobs: # Punctuation & Capitalization inference Punctuation_Capitalization_inference_Restore_punctuation_and_capitalization_in_long_text: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - output_dir="$(mktemp -d -p "$(pwd)")" && \ - python examples/nlp/token_classification/punctuate_capitalize_infer.py \ - --input_manifest /home/TestData/nlp/token_classification_punctuation/iwslt_tst2019.manifest \ - --output_text "${output_dir}/iwslt_inference_result.txt" \ - --max_seq_length 92 \ - --step 8 \ - --margin 16 \ - --pretrained_name punctuation_en_bert \ - --batch_size 32 && \ - rm -rf "${output_dir}" - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" - + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + output_dir="$(mktemp -d -p "$(pwd)")" && \ + python examples/nlp/token_classification/punctuate_capitalize_infer.py \ + --input_manifest /home/TestData/nlp/token_classification_punctuation/iwslt_tst2019.manifest \ + --output_text "${output_dir}/iwslt_inference_result.txt" \ + --max_seq_length 92 \ + --step 8 \ + --margin 16 \ + --pretrained_name punctuation_en_bert \ + --batch_size 32; + rm -rf "${output_dir}" + # L2: Parallel Pretraining BERT pretraining from Text/Preprocessed L2_Pretraining_BERT_pretraining_from_Text: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/language_modeling && \ - python bert_pretraining.py \ - --config-name=bert_pretraining_from_text_config.yaml \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - trainer.precision=16 \ - +trainer.fast_dev_run=true \ - model.train_ds.data_file=/home/TestData/nlp/wikitext-2/train.txt \ - model.train_ds.batch_size=32 \ - model.validation_ds.data_file=/home/TestData/nlp/wikitext-2/valid.txt \ - model.validation_ds.batch_size=32 \ - model.language_model.config_file=/home/TestData/nlp/bert_configs/bert_3200.json \ - model.optim.lr=0.01 \ - model.optim.sched.warmup_ratio=0.1 \ - model.tokenizer.tokenizer_name=sentencepiece \ - model.tokenizer.tokenizer_model=/home/TestData/nlp/wikitext-2/tokenizer_bpe_v3193/tokenizer.model \ - model.mask_prob=0.15 \ - model.short_seq_prob=0.1 \ - exp_manager.exp_dir=PretrainingBERTFromText \ - - rm -f /home/TestData/nlp/wikitext-2/*.pkl - #rm -rf examples/nlp/language_modeling/PretrainingBERTFromText - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + cd examples/nlp/language_modeling && \ + python bert_pretraining.py \ + 
--config-name=bert_pretraining_from_text_config.yaml \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + trainer.precision=16 \ + +trainer.fast_dev_run=true \ + model.train_ds.data_file=/home/TestData/nlp/wikitext-2/train.txt \ + model.train_ds.batch_size=32 \ + model.validation_ds.data_file=/home/TestData/nlp/wikitext-2/valid.txt \ + model.validation_ds.batch_size=32 \ + model.language_model.config_file=/home/TestData/nlp/bert_configs/bert_3200.json \ + model.optim.lr=0.01 \ + model.optim.sched.warmup_ratio=0.1 \ + model.tokenizer.tokenizer_name=sentencepiece \ + model.tokenizer.tokenizer_model=/home/TestData/nlp/wikitext-2/tokenizer_bpe_v3193/tokenizer.model \ + model.mask_prob=0.15 \ + model.short_seq_prob=0.1 \ + exp_manager.exp_dir=PretrainingBERTFromText; + AFTER_SCRIPT: | + rm -f /home/TestData/nlp/wikitext-2/*.pkl + #rm -rf examples/nlp/language_modeling/PretrainingBERTFromText L2_Pretraining_BERT_from_Preprocessed: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/language_modeling && \ - python bert_pretraining.py \ - --config-name=bert_pretraining_from_preprocessed_config.yaml \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - trainer.precision=16 \ - +trainer.fast_dev_run=false \ - +trainer.max_epochs=1 \ - +trainer.limit_val_batches=0 \ - +trainer.limit_train_batches=1 \ - model.train_ds.data_file=/home/TestData/nlp/wiki_book_mini/training \ - model.train_ds.batch_size=8 \ - model.language_model.lm_checkpoint=/home/TestData/nlp/bert_ckpts/nemo1.0/bert_base_uncased_mlm_final_1074591_nemo1.0.pt \ - model.language_model.config_file=/home/TestData/nlp/bert_configs/uncased_L-12_H-768_A-12.json \ - model.optim.lr=0.875e-4 \ - model.optim.weight_decay=0.01 \ - model.optim.sched.warmup_ratio=0.01 \ - exp_manager.exp_dir=PretrainingBERTFromPreprocessed \ - exp_manager.create_checkpoint_callback=False \ - + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + cd examples/nlp/language_modeling && \ + python bert_pretraining.py \ + --config-name=bert_pretraining_from_preprocessed_config.yaml \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + trainer.precision=16 \ + +trainer.fast_dev_run=false \ + +trainer.max_epochs=1 \ + +trainer.limit_val_batches=0 \ + +trainer.limit_train_batches=1 \ + model.train_ds.data_file=/home/TestData/nlp/wiki_book_mini/training \ + model.train_ds.batch_size=8 \ + model.language_model.lm_checkpoint=/home/TestData/nlp/bert_ckpts/nemo1.0/bert_base_uncased_mlm_final_1074591_nemo1.0.pt \ + model.language_model.config_file=/home/TestData/nlp/bert_configs/uncased_L-12_H-768_A-12.json \ + model.optim.lr=0.875e-4 \ + model.optim.weight_decay=0.01 \ + model.optim.sched.warmup_ratio=0.01 \ + exp_manager.exp_dir=PretrainingBERTFromPreprocessed \ + exp_manager.create_checkpoint_callback=False \ + #rm -rf examples/nlp/language_modeling/PretrainingBERTFromPreprocessed - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" # L2: Entity Linking L2_Entity_Linking_Self_Alignment_Pretraining_BERT: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ 
github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/entity_linking && \ - python self_alignment_pretraining.py \ - project_dir=. \ - trainer.val_check_interval=3 \ - model.raw_data=None \ - model.train_ds.data_file=/home/TestData/nlp/entity_linking/tiny_example_train_pairs.tsv \ - model.validation_ds.data_file=/home/TestData/nlp/entity_linking/tiny_example_validation_pairs.tsv \ - model.train_ds.batch_size=8 \ - model.validation_ds.batch_size=8 \ - exp_manager.exp_dir=null - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" - + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + cd examples/nlp/entity_linking && \ + python self_alignment_pretraining.py \ + project_dir=. \ + trainer.val_check_interval=3 \ + model.raw_data=None \ + model.train_ds.data_file=/home/TestData/nlp/entity_linking/tiny_example_train_pairs.tsv \ + model.validation_ds.data_file=/home/TestData/nlp/entity_linking/tiny_example_validation_pairs.tsv \ + model.train_ds.batch_size=8 \ + model.validation_ds.batch_size=8 \ + exp_manager.exp_dir=null # TODO: remove +model.optim.capturable=True when Pytorch fix: https://github.com/pytorch/pytorch/pull/81858 # is in the release container # L2: NMT Attention is All You Need Training L2_NMT_Attention_is_All_You_Need_Training_NMT_Training_Post-LN: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/machine_translation/enc_dec_nmt.py \ - --config-path=conf \ - --config-name=aayn_base \ - do_testing=false \ - model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ - model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ - model.encoder.num_layers=1 \ - model.encoder.hidden_size=64 \ - model.encoder.inner_size=256 \ - model.decoder.num_layers=1 \ - model.decoder.hidden_size=64 \ - model.decoder.inner_size=256 \ - +model.optim.capturable=True \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.val_check_interval=2 \ - +trainer.limit_val_batches=1 \ - +trainer.max_steps=2 \ - trainer.precision=16 \ - +exp_manager.explicit_log_dir=examples/nlp/machine_translation/nmt_results \ - +exp_manager.create_checkpoint_callback=true - - python examples/nlp/machine_translation/enc_dec_nmt.py \ - --config-path=conf \ - --config-name=aayn_base \ - do_testing=true \ - model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src 
\ - model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ - model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ - model.encoder.num_layers=1 \ - model.encoder.hidden_size=64 \ - model.encoder.inner_size=256 \ - model.decoder.num_layers=1 \ - model.decoder.hidden_size=64 \ - model.decoder.inner_size=256 \ - +model.optim.capturable=True \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.val_check_interval=10 \ - +trainer.limit_val_batches=1 \ - +trainer.limit_test_batches=1 \ - +trainer.max_steps=10 \ - +exp_manager.explicit_log_dir=examples/nlp/machine_translation/nmt_results \ - +exp_manager.create_checkpoint_callback=true \ - +exp_manager.resume_if_exists=True - - rm -rf examples/nlp/machine_translation/nmt_results - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" - + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + python examples/nlp/machine_translation/enc_dec_nmt.py \ + --config-path=conf \ + --config-name=aayn_base \ + do_testing=false \ + model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ + model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ + model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ + model.encoder.num_layers=1 \ + model.encoder.hidden_size=64 \ + model.encoder.inner_size=256 \ + model.decoder.num_layers=1 \ + model.decoder.hidden_size=64 \ + model.decoder.inner_size=256 \ + +model.optim.capturable=True \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + +trainer.val_check_interval=2 \ + +trainer.limit_val_batches=1 \ + +trainer.max_steps=2 \ + trainer.precision=16 \ + +exp_manager.explicit_log_dir=examples/nlp/machine_translation/nmt_results \ + +exp_manager.create_checkpoint_callback=true + + python examples/nlp/machine_translation/enc_dec_nmt.py \ + --config-path=conf \ + --config-name=aayn_base \ + do_testing=true \ + model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ + model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ + 
model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ + model.encoder.num_layers=1 \ + model.encoder.hidden_size=64 \ + model.encoder.inner_size=256 \ + model.decoder.num_layers=1 \ + model.decoder.hidden_size=64 \ + model.decoder.inner_size=256 \ + +model.optim.capturable=True \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + +trainer.val_check_interval=10 \ + +trainer.limit_val_batches=1 \ + +trainer.limit_test_batches=1 \ + +trainer.max_steps=10 \ + +exp_manager.explicit_log_dir=examples/nlp/machine_translation/nmt_results \ + +exp_manager.create_checkpoint_callback=true \ + +exp_manager.resume_if_exists=True + AFTER_SCRIPT: | + rm -rf examples/nlp/machine_translation/nmt_results L2_NMT_Attention_is_All_You_Need_Training_NMT_Training_Pre-LN: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/machine_translation && \ - python enc_dec_nmt.py \ - --config-path=conf \ - --config-name=aayn_base \ - do_testing=true \ - model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ - model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ - model.encoder.pre_ln=true \ - model.decoder.pre_ln=true \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - +trainer.limit_test_batches=2 \ - exp_manager=null - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + cd examples/nlp/machine_translation && \ + python enc_dec_nmt.py \ + --config-path=conf \ + --config-name=aayn_base \ + do_testing=true \ + model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ + model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ + model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ + model.encoder.pre_ln=true \ + model.decoder.pre_ln=true \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + +trainer.fast_dev_run=true \ + +trainer.limit_test_batches=2 \ + exp_manager=null L2_NMT_Attention_is_All_You_Need_Training_NMT_Multi-Validation: needs: [cicd-test-container-setup] - runs-on: 
self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/machine_translation && \ - python enc_dec_nmt.py \ - --config-path=conf \ - --config-name=aayn_base \ - do_testing=true \ - model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-en-de.src \ - model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-en-de.ref \ - model.validation_ds.src_file_name=[/home/TestData/nlp/nmt/toy_data/wmt13-en-de.src,/home/TestData/nlp/nmt/toy_data/wmt14-en-de.src] \ - model.validation_ds.tgt_file_name=[/home/TestData/nlp/nmt/toy_data/wmt13-en-de.ref,/home/TestData/nlp/nmt/toy_data/wmt14-en-de.ref] \ - model.test_ds.src_file_name=[/home/TestData/nlp/nmt/toy_data/wmt13-en-de.src,/home/TestData/nlp/nmt/toy_data/wmt14-en-de.src] \ - model.test_ds.tgt_file_name=[/home/TestData/nlp/nmt/toy_data/wmt13-en-de.ref,/home/TestData/nlp/nmt/toy_data/wmt14-en-de.ref] \ - model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ - model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - +trainer.limit_test_batches=2 \ - exp_manager=null - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + cd examples/nlp/machine_translation && \ + python enc_dec_nmt.py \ + --config-path=conf \ + --config-name=aayn_base \ + do_testing=true \ + model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-en-de.src \ + model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-en-de.ref \ + model.validation_ds.src_file_name=[/home/TestData/nlp/nmt/toy_data/wmt13-en-de.src,/home/TestData/nlp/nmt/toy_data/wmt14-en-de.src] \ + model.validation_ds.tgt_file_name=[/home/TestData/nlp/nmt/toy_data/wmt13-en-de.ref,/home/TestData/nlp/nmt/toy_data/wmt14-en-de.ref] \ + model.test_ds.src_file_name=[/home/TestData/nlp/nmt/toy_data/wmt13-en-de.src,/home/TestData/nlp/nmt/toy_data/wmt14-en-de.src] \ + model.test_ds.tgt_file_name=[/home/TestData/nlp/nmt/toy_data/wmt13-en-de.ref,/home/TestData/nlp/nmt/toy_data/wmt14-en-de.ref] \ + model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ + model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + +trainer.fast_dev_run=true \ + +trainer.limit_test_batches=2 \ + exp_manager=null # L2: NMT Attention is All You Need Inference L2_NMT_Attention_is_All_You_Need_Inference: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/machine_translation && \ - python nmt_transformer_infer.py \ - --model=/home/TestData/nlp/nmt/toy_data/enes_v16k_s100k_6x6.nemo \ - 
--srctext=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.test.src \ - --tgtout=/home/TestData/nlp/nmt/toy_data/out.txt \ - --target_lang en \ - --source_lang de - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + cd examples/nlp/machine_translation && \ + python nmt_transformer_infer.py \ + --model=/home/TestData/nlp/nmt/toy_data/enes_v16k_s100k_6x6.nemo \ + --srctext=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.test.src \ + --tgtout=/home/TestData/nlp/nmt/toy_data/out.txt \ + --target_lang en \ + --source_lang de # L2: NMT Attention is All You Need Finetuning L2_NMT_Attention_is_All_You_Need_Finetuning: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/machine_translation && \ - python enc_dec_nmt_finetune.py \ - model_path=/home/TestData/nlp/nmt/toy_data/enes_v16k_s100k_6x6.nemo \ - trainer.devices=1 \ - ~trainer.max_epochs \ - model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - +trainer.val_check_interval=10 \ - +trainer.limit_val_batches=1 \ - +trainer.limit_test_batches=1 \ - +trainer.max_steps=10 \ - +exp_manager.exp_dir=examples/nlp/machine_translation/nmt_finetune \ - +exp_manager.create_checkpoint_callback=True \ - +exp_manager.checkpoint_callback_params.monitor=val_sacreBLEU \ - +exp_manager.checkpoint_callback_params.mode=max \ - +exp_manager.checkpoint_callback_params.save_best_model=true - - rm -rf examples/nlp/machine_translation/nmt_finetune - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + cd examples/nlp/machine_translation && \ + python enc_dec_nmt_finetune.py \ + model_path=/home/TestData/nlp/nmt/toy_data/enes_v16k_s100k_6x6.nemo \ + trainer.devices=1 \ + ~trainer.max_epochs \ + model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ + model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + +trainer.val_check_interval=10 \ + +trainer.limit_val_batches=1 \ + +trainer.limit_test_batches=1 \ + +trainer.max_steps=10 \ + +exp_manager.exp_dir=examples/nlp/machine_translation/nmt_finetune \ + +exp_manager.create_checkpoint_callback=True \ + +exp_manager.checkpoint_callback_params.monitor=val_sacreBLEU \ + 
+exp_manager.checkpoint_callback_params.mode=max \ + +exp_manager.checkpoint_callback_params.save_best_model=true + AFTER_SCRIPT: | + rm -rf examples/nlp/machine_translation/nmt_finetune # L2: NMT Tarred Dataset Creation L2_NMT_Tarred_Dataset_Creation_Auto_Tarred_Dataset_Creation: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/machine_translation && \ - python enc_dec_nmt.py \ - --config-path=conf \ - --config-name=aayn_base \ - do_training=false \ - model.preproc_out_dir=$PWD/preproc_out_dir \ - model.train_ds.use_tarred_dataset=true \ - model.train_ds.n_preproc_jobs=2 \ - model.train_ds.lines_per_dataset_fragment=500 \ - model.train_ds.num_batches_per_tarfile=10 \ - model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.encoder_tokenizer.vocab_size=2000 \ - model.decoder_tokenizer.vocab_size=2000 \ - ~model.test_ds \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - exp_manager=null \ - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + cd examples/nlp/machine_translation && \ + python enc_dec_nmt.py \ + --config-path=conf \ + --config-name=aayn_base \ + do_training=false \ + model.preproc_out_dir=$PWD/preproc_out_dir \ + model.train_ds.use_tarred_dataset=true \ + model.train_ds.n_preproc_jobs=2 \ + model.train_ds.lines_per_dataset_fragment=500 \ + model.train_ds.num_batches_per_tarfile=10 \ + model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ + model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + model.encoder_tokenizer.vocab_size=2000 \ + model.decoder_tokenizer.vocab_size=2000 \ + ~model.test_ds \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + +trainer.fast_dev_run=true \ + exp_manager=null L2_NMT_Tarred_Dataset_Creation_Script_Tarred_Dataset_Creation: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/machine_translation && \ - python create_tarred_parallel_dataset.py \ - --src_fname /home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - --tgt_fname /home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - --out_dir $PWD/out_dir \ - --encoder_tokenizer_vocab_size=2000 \ - --decoder_tokenizer_vocab_size=2000 \ - --tokens_in_batch=1000 \ - --lines_per_dataset_fragment=500 \ - 
--num_batches_per_tarfile=10 \ - --n_preproc_jobs=2 \ - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + cd examples/nlp/machine_translation && \ + python create_tarred_parallel_dataset.py \ + --src_fname /home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + --tgt_fname /home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ + --out_dir $PWD/out_dir \ + --encoder_tokenizer_vocab_size=2000 \ + --decoder_tokenizer_vocab_size=2000 \ + --tokens_in_batch=1000 \ + --lines_per_dataset_fragment=500 \ + --num_batches_per_tarfile=10 \ + --n_preproc_jobs=2 L2_Megatron_NMT_Training_TP2: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/machine_translation/megatron_nmt_training.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - +trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/machine_translation/megatron_nmt_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation='swiglu' \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation='swiglu' \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.micro_batch_size=2 \ - model.global_batch_size=4 \ - model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - model.train_ds.num_workers=1 \ - model.validation_ds.num_workers=1 \ - ~model.test_ds \ - model.train_ds.dataset_type=text_memmap \ - model.encoder_tokenizer.library=sentencepiece \ - model.encoder_tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model \ - model.decoder_tokenizer.library=sentencepiece \ - model.decoder_tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model - # Change val_check_interval to 1 for resume as the len(dataloder) is 1 due to max_steps being the same as that of training and Lightning 2.0 raises an error - # if val_check_interval > len(dataloder: https://github.com/Lightning-AI/lightning/blob/2.0.6/src/lightning/pytorch/loops/fit_loop.py#L259 at the beginning of fit_loop.run() - python examples/nlp/machine_translation/megatron_nmt_training.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - 
trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - +trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/machine_translation/megatron_nmt_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation='swiglu' \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation='swiglu' \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.micro_batch_size=2 \ - model.global_batch_size=4 \ - model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - model.train_ds.num_workers=1 \ - model.validation_ds.num_workers=1 \ - ~model.test_ds \ - model.train_ds.dataset_type=text_memmap \ - model.encoder_tokenizer.library=sentencepiece \ - model.encoder_tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model \ - model.decoder_tokenizer.library=sentencepiece \ - model.decoder_tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model - rm -rf examples/nlp/machine_translation/megatron_nmt_results - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/machine_translation/megatron_nmt_training.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=10 \ + +trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=10 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/machine_translation/megatron_nmt_results \ + model.tensor_model_parallel_size=2 \ + model.seq_length=128 \ + model.encoder.num_layers=4 \ + model.encoder.hidden_size=64 \ + model.encoder.num_attention_heads=8 \ + model.encoder.activation='swiglu' \ + model.encoder.masked_softmax_fusion=False \ + model.encoder.bias_activation_fusion=False \ + model.encoder.activations_checkpoint_method='block' \ + model.encoder.activations_checkpoint_num_layers=1 \ + model.decoder.num_layers=2 \ + model.decoder.hidden_size=64 \ + model.decoder.num_attention_heads=8 \ + model.decoder.activation='swiglu' \ + model.decoder.masked_softmax_fusion=False \ + model.decoder.bias_activation_fusion=False \ + model.decoder.activations_checkpoint_method='block' \ + model.decoder.activations_checkpoint_num_layers=1 \ + model.micro_batch_size=2 \ + model.global_batch_size=4 \ + model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ + 
model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
+ model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \
+ model.train_ds.num_workers=1 \
+ model.validation_ds.num_workers=1 \
+ ~model.test_ds \
+ model.train_ds.dataset_type=text_memmap \
+ model.encoder_tokenizer.library=sentencepiece \
+ model.encoder_tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model \
+ model.decoder_tokenizer.library=sentencepiece \
+ model.decoder_tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model
+ # Change val_check_interval to 1 for resume as the len(dataloader) is 1 due to max_steps being the same as that of training and Lightning 2.0 raises an error
+ # if val_check_interval > len(dataloader): https://github.com/Lightning-AI/lightning/blob/2.0.6/src/lightning/pytorch/loops/fit_loop.py#L259 at the beginning of fit_loop.run()
+ python examples/nlp/machine_translation/megatron_nmt_training.py \
+ trainer.devices=2 \
+ trainer.accelerator=gpu \
+ trainer.log_every_n_steps=1 \
+ trainer.val_check_interval=1 \
+ +trainer.limit_val_batches=2 \
+ trainer.accumulate_grad_batches=1 \
+ trainer.max_steps=10 \
+ trainer.precision=16 \
+ trainer.gradient_clip_val=1.0 \
+ exp_manager.exp_dir=examples/nlp/machine_translation/megatron_nmt_results \
+ model.tensor_model_parallel_size=2 \
+ model.seq_length=128 \
+ model.encoder.num_layers=4 \
+ model.encoder.hidden_size=64 \
+ model.encoder.num_attention_heads=8 \
+ model.encoder.activation='swiglu' \
+ model.encoder.masked_softmax_fusion=False \
+ model.encoder.bias_activation_fusion=False \
+ model.encoder.activations_checkpoint_method='block' \
+ model.encoder.activations_checkpoint_num_layers=1 \
+ model.decoder.num_layers=2 \
+ model.decoder.hidden_size=64 \
+ model.decoder.num_attention_heads=8 \
+ model.decoder.activation='swiglu' \
+ model.decoder.masked_softmax_fusion=False \
+ model.decoder.bias_activation_fusion=False \
+ model.decoder.activations_checkpoint_method='block' \
+ model.decoder.activations_checkpoint_num_layers=1 \
+ model.micro_batch_size=2 \
+ model.global_batch_size=4 \
+ model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
+ model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \
+ model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
+ model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \
+ model.train_ds.num_workers=1 \
+ model.validation_ds.num_workers=1 \
+ ~model.test_ds \
+ model.train_ds.dataset_type=text_memmap \
+ model.encoder_tokenizer.library=sentencepiece \
+ model.encoder_tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model \
+ model.decoder_tokenizer.library=sentencepiece \
+ model.decoder_tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model
+ AFTER_SCRIPT: |
+ rm -rf examples/nlp/machine_translation/megatron_nmt_results

 L2_Megatron_BART_Perceiver_MIM_Training_TP2:
 needs: [cicd-test-container-setup]
- runs-on: self-hosted-azure
- container:
- image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
- options:
- # --user 0:128
- --device=/dev/nvidia0
- --gpus all
- --shm-size=8g
- --env TRANSFORMERS_OFFLINE=0
- --env HYDRA_FULL_ERROR=1
- --volume /mnt/datadrive/TestData:/home/TestData
- steps:
- - name: Checkout repository
- uses: actions/checkout@v4
- - run: |
- python examples/nlp/language_modeling/megatron_bart_pretraining.py \
- trainer.devices=2 \
- 
trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/megatron_mim_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.arch=perceiver \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation='swiglu' \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation='swiglu' \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.micro_batch_size=2 \ - model.global_batch_size=4 \ - model.data.data_impl=text_mmap \ - model.data.data_prefix=[1.0,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src] \ - model.data.splits_string='"800,100,100"' \ - model.data.whole_word_masking=False \ - model.tokenizer.library=sentencepiece \ - model.tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model \ - ++model.hiddens.enc_output_name=z \ - ++model.hiddens.transform.q_z_given_x.cls_name=cond_gaussian \ - ++model.hiddens.transform.q_z_given_x.hidden_size=64 \ - ++model.hiddens.loss.mim.cls_name=a_mim \ - ++model.hiddens.loss.mim.loss_weight=0.5 - # Change val_check_interval to 1 for resume as the len(dataloder) is 1 due to max_steps being the same as that of training and Lightning 2.0 raises an error - # if val_check_interval > len(dataloder: https://github.com/Lightning-AI/lightning/blob/2.0.6/src/lightning/pytorch/loops/fit_loop.py#L259 at the beginning of fit_loop.run() - python examples/nlp/language_modeling/megatron_bart_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/megatron_mim_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.arch=perceiver \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation='swiglu' \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation='swiglu' \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.micro_batch_size=2 \ - model.global_batch_size=4 \ - model.data.data_impl=text_mmap \ - model.data.data_prefix=[1.0,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src] \ - model.data.splits_string='"800,100,100"' \ - model.data.whole_word_masking=False \ - 
model.tokenizer.library=sentencepiece \
- model.tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model \
- ++model.hiddens.enc_output_name=z \
- ++model.hiddens.transform.q_z_given_x.cls_name=cond_gaussian \
- ++model.hiddens.transform.q_z_given_x.hidden_size=64 \
- ++model.hiddens.loss.mim.cls_name=a_mim \
- ++model.hiddens.loss.mim.loss_weight=0.5
- rm -rf examples/nlp/language_modeling/megatron_mim_results
- - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
- if: "failure()"
+ uses: ./.github/workflows/_test_template.yml
+ with:
+ RUNNER: self-hosted-azure
+ SCRIPT: |
+ python examples/nlp/language_modeling/megatron_bart_pretraining.py \
+ trainer.devices=2 \
+ trainer.accelerator=gpu \
+ trainer.log_every_n_steps=1 \
+ trainer.val_check_interval=10 \
+ trainer.limit_val_batches=2 \
+ trainer.accumulate_grad_batches=1 \
+ trainer.max_steps=10 \
+ trainer.precision=16 \
+ trainer.gradient_clip_val=1.0 \
+ exp_manager.exp_dir=examples/nlp/language_modeling/megatron_mim_results \
+ model.tensor_model_parallel_size=2 \
+ model.seq_length=128 \
+ model.encoder.num_layers=4 \
+ model.encoder.hidden_size=64 \
+ model.encoder.arch=perceiver \
+ model.encoder.num_attention_heads=8 \
+ model.encoder.activation='swiglu' \
+ model.encoder.masked_softmax_fusion=False \
+ model.encoder.bias_activation_fusion=False \
+ model.encoder.activations_checkpoint_method='block' \
+ model.encoder.activations_checkpoint_num_layers=1 \
+ model.decoder.num_layers=2 \
+ model.decoder.hidden_size=64 \
+ model.decoder.num_attention_heads=8 \
+ model.decoder.activation='swiglu' \
+ model.decoder.masked_softmax_fusion=False \
+ model.decoder.bias_activation_fusion=False \
+ model.decoder.activations_checkpoint_method='block' \
+ model.decoder.activations_checkpoint_num_layers=1 \
+ model.micro_batch_size=2 \
+ model.global_batch_size=4 \
+ model.data.data_impl=text_mmap \
+ model.data.data_prefix=[1.0,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src] \
+ model.data.splits_string='"800,100,100"' \
+ model.data.whole_word_masking=False \
+ model.tokenizer.library=sentencepiece \
+ model.tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model \
+ ++model.hiddens.enc_output_name=z \
+ ++model.hiddens.transform.q_z_given_x.cls_name=cond_gaussian \
+ ++model.hiddens.transform.q_z_given_x.hidden_size=64 \
+ ++model.hiddens.loss.mim.cls_name=a_mim \
+ ++model.hiddens.loss.mim.loss_weight=0.5
+ # Change val_check_interval to 1 for resume as the len(dataloader) is 1 due to max_steps being the same as that of training and Lightning 2.0 raises an error
+ # if val_check_interval > len(dataloader): https://github.com/Lightning-AI/lightning/blob/2.0.6/src/lightning/pytorch/loops/fit_loop.py#L259 at the beginning of fit_loop.run()
+ python examples/nlp/language_modeling/megatron_bart_pretraining.py \
+ trainer.devices=2 \
+ trainer.accelerator=gpu \
+ trainer.log_every_n_steps=1 \
+ trainer.val_check_interval=1 \
+ trainer.limit_val_batches=2 \
+ trainer.accumulate_grad_batches=1 \
+ trainer.max_steps=10 \
+ trainer.precision=16 \
+ trainer.gradient_clip_val=1.0 \
+ exp_manager.exp_dir=examples/nlp/language_modeling/megatron_mim_results \
+ model.tensor_model_parallel_size=2 \
+ model.seq_length=128 \
+ model.encoder.num_layers=4 \
+ model.encoder.hidden_size=64 \
+ model.encoder.arch=perceiver \
+ model.encoder.num_attention_heads=8 \
+ model.encoder.activation='swiglu' \
+ model.encoder.masked_softmax_fusion=False \
+ model.encoder.bias_activation_fusion=False \
+ 
model.encoder.activations_checkpoint_method='block' \ + model.encoder.activations_checkpoint_num_layers=1 \ + model.decoder.num_layers=2 \ + model.decoder.hidden_size=64 \ + model.decoder.num_attention_heads=8 \ + model.decoder.activation='swiglu' \ + model.decoder.masked_softmax_fusion=False \ + model.decoder.bias_activation_fusion=False \ + model.decoder.activations_checkpoint_method='block' \ + model.decoder.activations_checkpoint_num_layers=1 \ + model.micro_batch_size=2 \ + model.global_batch_size=4 \ + model.data.data_impl=text_mmap \ + model.data.data_prefix=[1.0,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src] \ + model.data.splits_string='"800,100,100"' \ + model.data.whole_word_masking=False \ + model.tokenizer.library=sentencepiece \ + model.tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model \ + ++model.hiddens.enc_output_name=z \ + ++model.hiddens.transform.q_z_given_x.cls_name=cond_gaussian \ + ++model.hiddens.transform.q_z_given_x.hidden_size=64 \ + ++model.hiddens.loss.mim.cls_name=a_mim \ + ++model.hiddens.loss.mim.loss_weight=0.5 + AFTER_SCRIPT: | + rm -rf examples/nlp/language_modeling/megatron_mim_results # stage('L2: NMT Bottleneck Fallback') { # when { @@ -3431,63 +2581,322 @@ jobs: L2_Megatron_Bert_Pretraining_and_Resume_Training_with_Pipeline_Parallelism: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/megatron_bert_pretraining.py \ + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_bert_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=10 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=10 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ + model.pipeline_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=2 \ + model.optim.sched.constant_steps=2 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=block \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings + + python examples/nlp/language_modeling/megatron_bert_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=10 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=20 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + 
exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ + exp_manager.resume_if_exists=True \ + model.pipeline_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=2 \ + model.optim.sched.constant_steps=2 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=block \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings + AFTER_SCRIPT: | + rm -rf examples/nlp/language_modeling/bert_pretrain_results + rm -rf examples/nlp/language_modeling/bert_index_mappings + + L2_Megatron_Bert_Pretraining_and_Resume_Training: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_bert_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=10 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=10 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.sequence_parallel=True \ + model.optim.sched.warmup_steps=2 \ + model.optim.sched.constant_steps=2 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=block \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings + + python examples/nlp/language_modeling/megatron_bert_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=10 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=20 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ + exp_manager.resume_if_exists=True \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=2 \ + model.optim.sched.constant_steps=2 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=block \ 
+ model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings + AFTER_SCRIPT: | + rm -rf examples/nlp/language_modeling/bert_pretrain_results + rm -rf examples/nlp/language_modeling/bert_index_mappings + + L2_Megatron_Core_Bert_Pretraining_and_Resume_Training: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=10 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=10 \ + trainer.precision=32 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ + model.mcore_bert=True \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.sequence_parallel=True \ + model.optim.sched.warmup_steps=2 \ + model.optim.sched.constant_steps=2 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method='block' \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings + + NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=10 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=20 \ + trainer.precision=32 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ + exp_manager.resume_if_exists=True \ + model.mcore_bert=True \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=2 \ + model.optim.sched.constant_steps=2 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method='block' \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings + AFTER_SCRIPT: | + rm -rf examples/nlp/language_modeling/bert_pretrain_results + rm -rf examples/nlp/language_modeling/bert_index_mappings + + L2_Megatron_RETRO_Pretraining_and_Resume_Training: + needs: 
[cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_retro_pretraining.py \ + trainer.num_nodes=1 \ + trainer.devices=2 \ + trainer.precision=bf16 \ + trainer.accelerator=gpu \ + model.data.data_prefix=['none'] \ + exp_manager.exp_dir=examples/nlp/language_modeling/mcore_retro_results \ + model.mcore_gpt=True \ + model.tensor_model_parallel_size=1 \ + model.pipeline_model_parallel_size=1 \ + model.optim.name=distributed_fused_adam \ + model.retro.retro_project_dir=/home/TestData/nlp/megatron_retro/mcore_retro/micro-wiki-core \ + model.data.num_workers=4 \ + model.micro_batch_size=1 \ + model.data.shuffle_documents=False \ + trainer.val_check_interval=30 \ + +trainer.num_sanity_val_steps=0 \ + model.init_method_std=0.023 \ + model.optim.lr=6.0e-4 \ + model.megatron_amp_O2=True \ + model.data.splits_string=\'\"98,2,0\"\' \ + model.data.dataloader_type=cyclic \ + trainer.max_steps=10 + + python examples/nlp/language_modeling/megatron_retro_pretraining.py \ + trainer.num_nodes=1 \ + trainer.devices=2 \ + trainer.precision=bf16 \ + trainer.accelerator=gpu \ + model.data.data_prefix=['none'] \ + exp_manager.exp_dir=examples/nlp/language_modeling/mcore_retro_results \ + model.mcore_gpt=True \ + model.tensor_model_parallel_size=1 \ + model.pipeline_model_parallel_size=1 \ + model.optim.name=distributed_fused_adam \ + model.retro.retro_project_dir=/home/TestData/nlp/megatron_retro/mcore_retro/micro-wiki-core \ + model.data.num_workers=4 \ + model.micro_batch_size=1 \ + model.data.shuffle_documents=False \ + trainer.val_check_interval=30 \ + +trainer.num_sanity_val_steps=0 \ + model.init_method_std=0.023 \ + model.optim.lr=6.0e-4 \ + model.megatron_amp_O2=True \ + model.data.splits_string=\'\"98,2,0\"\' \ + model.data.dataloader_type=cyclic \ + trainer.max_steps=20 + AFTER_SCRIPT: | + rm -rf examples/nlp/language_modeling/mcore_retro_results + + L2_Legacy_Megatron_RETRO_Pretraining_and_Resume_Training: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_retro_pretraining_legacy.py \ + trainer.devices=2 \ + trainer.num_nodes=1 \ + trainer.accelerator=gpu \ + trainer.accumulate_grad_batches=1 \ + trainer.limit_val_batches=2 \ + exp_manager.resume_if_exists=True \ + trainer.max_steps=10 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + trainer.val_check_interval=10 \ + exp_manager.exp_dir=examples/nlp/language_modeling/retro_legacy_results \ + model.data.data_prefix= \ + model.data.knn_index= \ + model.data.retrieval_prefix= \ + model.tensor_model_parallel_size=2 \ + model.micro_batch_size=4 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=2 \ + model.optim.sched.constant_steps=2 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.chunk_size=32 \ + model.enc_num_layers=2 \ + model.dec_num_layers=2 \ + model.enc_cross_attention=[1] \ + model.dec_cross_attention=[1] \ + +model.data.mock=True + + python examples/nlp/language_modeling/megatron_retro_pretraining_legacy.py \ trainer.devices=2 \ + trainer.num_nodes=1 \ trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - 
trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - model.pipeline_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings - - python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ + exp_manager.resume_if_exists=True \ trainer.max_steps=20 \ trainer.precision=16 \ trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.pipeline_model_parallel_size=2 \ + trainer.val_check_interval=10 \ + exp_manager.exp_dir=examples/nlp/language_modeling/retro_legacy_results \ + model.data.data_prefix= \ + model.data.knn_index= \ + model.data.retrieval_prefix= \ + model.tensor_model_parallel_size=2 \ + model.micro_batch_size=4 \ model.optim.name=fused_adam \ model.optim.lr=2e-4 \ model.optim.sched.warmup_steps=2 \ @@ -3495,24 +2904,113 @@ jobs: model.optim.sched.min_lr=8e-5 \ model.max_position_embeddings=128 \ model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings - - rm -rf examples/nlp/language_modeling/bert_pretrain_results - rm -rf examples/nlp/language_modeling/bert_index_mappings - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" - - L2_Megatron_Bert_Pretraining_and_Resume_Training: + model.chunk_size=32 \ + model.enc_num_layers=2 \ + model.dec_num_layers=2 \ + model.enc_cross_attention=[1] \ + model.dec_cross_attention=[1] \ + +model.data.mock=True + AFTER_SCRIPT: | + rm -rf examples/nlp/language_modeling/retro_legacy_results + + # L2_Megatron_RETRO_muTransfer_Pretraining_Performance: + # needs: [cicd-test-container-setup] + # runs-on: self-hosted-azure + # container: + # image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} + # options: + # # --user 0:128 + # --device=/dev/nvidia0 + # --gpus all + # --shm-size=8g + # --env TRANSFORMERS_OFFLINE=0 + # --env HYDRA_FULL_ERROR=1 + # --volume /mnt/datadrive/TestData:/home/TestData + # steps: + # - name: Checkout repository + # uses: actions/checkout@v4 + # - run: | + # python 
examples/nlp/language_modeling/megatron_retro_mutransfer_pretrain.py \ + # trainer.devices=2 \ + # trainer.num_nodes=1 \ + # trainer.accelerator=gpu \ + # trainer.accumulate_grad_batches=1 \ + # trainer.max_steps=100 \ + # trainer.log_every_n_steps=1 \ + # trainer.precision=16 \ + # trainer.val_check_interval=100 \ + # trainer.limit_val_batches=0 \ + # trainer.gradient_clip_val=1.0 \ + # +trainer.num_sanity_val_steps=0 \ + # exp_manager.exp_dir=examples/nlp/language_modeling/retro_results/ \ + # +exp_manager.version=smalltest \ + # model.data.neighbors=2 \ + # model.megatron_amp_O2=False \ + # model.apply_query_key_layer_scaling=False \ + # model.tensor_model_parallel_size=1 \ + # model.optim.name=muadamw \ + # model.optim.weight_decay=0.1 \ + # model.optim.betas=[0.9,0.95] \ + # model.optim.lr=6e-4 \ + # model.optim.sched.warmup_steps=1000 \ + # model.optim.sched.constant_steps=0 \ + # model.optim.sched.min_lr=6e-5 \ + # model.add_position_embedding=False \ + # model.enc_num_layers=2 \ + # model.dec_num_layers=6 \ + # model.enc_cross_attention=[0] \ + # model.dec_cross_attention=[3,5] \ + # model.hidden_size=96 \ + # model.ffn_hidden_size=384 \ + # model.init_method_std=0.023 \ + # model.num_attention_heads=12 \ + # model.max_position_embeddings=1024 \ + # model.encoder_seq_length=1024 \ + # model.tokenizer.library=megatron \ + # model.tokenizer.type=GPT2BPETokenizer \ + # model.tokenizer.merge_file=/home/TestData/nlp/megatron_retro/gpt2-merges.txt \ + # model.tokenizer.vocab_file=/home/TestData/nlp/megatron_retro/gpt2-vocab.json \ + # model.data.data_prefix=[/home/TestData/nlp/megatron_retro/retro_wiki_test_text_document] \ + # model.data.knn_index=[/home/TestData/nlp/megatron_retro/knn2_map_wiki_test.idx] \ + # model.data.retrieval_prefix=/home/TestData/nlp/megatron_retro/retro_wiki_test_text_document \ + # model.data.index_mapping_dir=/home/TestData/nlp/megatron_retro \ + # model.data.num_workers=8 \ + # model.micro_batch_size=8 \ + # model.normalization=rmsnorm \ + # model.transformer_block_type=pre_ln \ + # model.bias_activation_fusion=True \ + # model.bias_dropout_add_fusion=False \ + # model.masked_softmax_fusion=True \ + # model.hidden_dropout=0 \ + # model.attention_dropout=0 \ + # model.fp32_residual_connection=True \ + # model.shape_file=/home/TestData/nlp/megatron_retro/o1_rel_shape_info_tiny.yaml + + # python -c "import pandas as pd + # import pathlib + # from pandas.testing import assert_frame_equal + # from tensorboard.backend.event_processing.event_accumulator import EventAccumulator + # import torch + # if not (torch.cuda.is_available() and 'A100' in torch.cuda.get_device_name()): + # import sys + # sys.exit(0) + # event_file = list(pathlib.Path('examples/nlp/language_modeling/retro_results/megatron_retro/smalltest').glob('events.out.tfevents*'))[0] + # ea = EventAccumulator(str(event_file)).Reload() + # vals = [] + # for i in ea.Scalars('reduced_train_loss'): + # vals.append(i.value) + # training_curve = pd.DataFrame({'loss': vals}) + # gt_curve = pd.read_csv('/home/TestData/nlp/megatron_retro/expected_learning_curve.csv') + # assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)" + + # rm -rf examples/nlp/language_modeling/retro_results + # - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" + # if: "failure()" + + L2_RAG_Pipeline_Indexing: needs: [cicd-test-container-setup] runs-on: self-hosted-azure + timeout-minutes: 10 container: image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} options: @@ -3527,74 +3025,23 @@ jobs: - name: Checkout 
repository uses: actions/checkout@v4 - run: | - python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.sequence_parallel=True \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings - - python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=20 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings - - rm -rf examples/nlp/language_modeling/bert_pretrain_results - rm -rf examples/nlp/language_modeling/bert_index_mappings + python examples/nlp/rag/rag_indexing.py \ + trainer.num_nodes=1 \ + trainer.devices=1 \ + trainer.precision='bf16-mixed' \ + indexing.embedder.model_path='/home/TestData/nlp/rag_pipeline/testing_models/embedders/sbert_nemo.nemo' \ + indexing.embedder.embed_batch_size=128 \ + indexing.data.data_path='/home/TestData/nlp/rag_pipeline/testing_data/corpus_data/sample_data' \ + indexing.data.chunk_size=256 \ + indexing.data.chunk_overlap=10 \ + indexing.index_path='/home/TestData/nlp/rag_pipeline/testing_data/saved_index/sample_index' - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" if: "failure()" - L2_Megatron_Core_Bert_Pretraining_and_Resume_Training: + L2_RAG_Pipeline_Generating: needs: [cicd-test-container-setup] runs-on: self-hosted-azure + timeout-minutes: 10 
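The rag_indexing.py invocation above splits the corpus into chunks of 256 tokens with an overlap of 10 before embedding them. As a rough illustration of what chunk_size and chunk_overlap control, here is a sketch using plain whitespace splitting (the real pipeline uses its own embedder tokenizer and chunker, so this is purely conceptual):

# Sketch only: shows the effect of chunk_size / chunk_overlap as passed to the
# RAG indexing job above; whitespace tokenization is an assumption for brevity.
def chunk_tokens(tokens, chunk_size=256, chunk_overlap=10):
    step = chunk_size - chunk_overlap
    return [tokens[i:i + chunk_size] for i in range(0, max(len(tokens) - chunk_overlap, 1), step)]

text = "some long corpus document " * 100
chunks = chunk_tokens(text.split(), chunk_size=256, chunk_overlap=10)
print(len(chunks), len(chunks[0]))  # consecutive chunks share 10 tokens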
container: image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} options: @@ -3609,537 +3056,199 @@ jobs: - name: Checkout repository uses: actions/checkout@v4 - run: | - NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=32 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - model.mcore_bert=True \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.sequence_parallel=True \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method='block' \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings - - NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=20 \ - trainer.precision=32 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.mcore_bert=True \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method='block' \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings - - rm -rf examples/nlp/language_modeling/bert_pretrain_results - rm -rf examples/nlp/language_modeling/bert_index_mappings - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" - - L2_Megatron_RETRO_Pretraining_and_Resume_Training: - needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python 
examples/nlp/language_modeling/megatron_retro_pretraining.py \ - trainer.num_nodes=1 \ - trainer.devices=2 \ - trainer.precision=bf16 \ - trainer.accelerator=gpu \ - model.data.data_prefix=['none'] \ - exp_manager.exp_dir=examples/nlp/language_modeling/mcore_retro_results \ - model.mcore_gpt=True \ - model.tensor_model_parallel_size=1 \ - model.pipeline_model_parallel_size=1 \ - model.optim.name=distributed_fused_adam \ - model.retro.retro_project_dir=/home/TestData/nlp/megatron_retro/mcore_retro/micro-wiki-core \ - model.data.num_workers=4 \ - model.micro_batch_size=1 \ - model.data.shuffle_documents=False \ - trainer.val_check_interval=30 \ - +trainer.num_sanity_val_steps=0 \ - model.init_method_std=0.023 \ - model.optim.lr=6.0e-4 \ - model.megatron_amp_O2=True \ - model.data.splits_string=\'\"98,2,0\"\' \ - model.data.dataloader_type=cyclic \ - trainer.max_steps=10 - - python examples/nlp/language_modeling/megatron_retro_pretraining.py \ - trainer.num_nodes=1 \ - trainer.devices=2 \ - trainer.precision=bf16 \ - trainer.accelerator=gpu \ - model.data.data_prefix=['none'] \ - exp_manager.exp_dir=examples/nlp/language_modeling/mcore_retro_results \ - model.mcore_gpt=True \ - model.tensor_model_parallel_size=1 \ - model.pipeline_model_parallel_size=1 \ - model.optim.name=distributed_fused_adam \ - model.retro.retro_project_dir=/home/TestData/nlp/megatron_retro/mcore_retro/micro-wiki-core \ - model.data.num_workers=4 \ - model.micro_batch_size=1 \ - model.data.shuffle_documents=False \ - trainer.val_check_interval=30 \ - +trainer.num_sanity_val_steps=0 \ - model.init_method_std=0.023 \ - model.optim.lr=6.0e-4 \ - model.megatron_amp_O2=True \ - model.data.splits_string=\'\"98,2,0\"\' \ - model.data.dataloader_type=cyclic \ - trainer.max_steps=20 - - rm -rf examples/nlp/language_modeling/mcore_retro_results - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" - - L2_Legacy_Megatron_RETRO_Pretraining_and_Resume_Training: - needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/megatron_retro_pretraining_legacy.py \ - trainer.devices=2 \ - trainer.num_nodes=1 \ - trainer.accelerator=gpu \ - trainer.accumulate_grad_batches=1 \ - trainer.limit_val_batches=2 \ - exp_manager.resume_if_exists=True \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - trainer.val_check_interval=10 \ - exp_manager.exp_dir=examples/nlp/language_modeling/retro_legacy_results \ - model.data.data_prefix= \ - model.data.knn_index= \ - model.data.retrieval_prefix= \ - model.tensor_model_parallel_size=2 \ - model.micro_batch_size=4 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.chunk_size=32 \ - model.enc_num_layers=2 \ - model.dec_num_layers=2 \ - model.enc_cross_attention=[1] \ - model.dec_cross_attention=[1] \ - +model.data.mock=True - - python examples/nlp/language_modeling/megatron_retro_pretraining_legacy.py \ - trainer.devices=2 \ - trainer.num_nodes=1 \ - trainer.accelerator=gpu 
\ - trainer.accumulate_grad_batches=1 \ - trainer.limit_val_batches=2 \ - exp_manager.resume_if_exists=True \ - trainer.max_steps=20 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - trainer.val_check_interval=10 \ - exp_manager.exp_dir=examples/nlp/language_modeling/retro_legacy_results \ - model.data.data_prefix= \ - model.data.knn_index= \ - model.data.retrieval_prefix= \ - model.tensor_model_parallel_size=2 \ - model.micro_batch_size=4 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.chunk_size=32 \ - model.enc_num_layers=2 \ - model.dec_num_layers=2 \ - model.enc_cross_attention=[1] \ - model.dec_cross_attention=[1] \ - +model.data.mock=True - - rm -rf examples/nlp/language_modeling/retro_legacy_results - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" - - # L2_Megatron_RETRO_muTransfer_Pretraining_Performance: - # needs: [cicd-test-container-setup] - # runs-on: self-hosted-azure - # container: - # image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - # options: - # # --user 0:128 - # --device=/dev/nvidia0 - # --gpus all - # --shm-size=8g - # --env TRANSFORMERS_OFFLINE=0 - # --env HYDRA_FULL_ERROR=1 - # --volume /mnt/datadrive/TestData:/home/TestData - # steps: - # - name: Checkout repository - # uses: actions/checkout@v4 - # - run: | - # python examples/nlp/language_modeling/megatron_retro_mutransfer_pretrain.py \ - # trainer.devices=2 \ - # trainer.num_nodes=1 \ - # trainer.accelerator=gpu \ - # trainer.accumulate_grad_batches=1 \ - # trainer.max_steps=100 \ - # trainer.log_every_n_steps=1 \ - # trainer.precision=16 \ - # trainer.val_check_interval=100 \ - # trainer.limit_val_batches=0 \ - # trainer.gradient_clip_val=1.0 \ - # +trainer.num_sanity_val_steps=0 \ - # exp_manager.exp_dir=examples/nlp/language_modeling/retro_results/ \ - # +exp_manager.version=smalltest \ - # model.data.neighbors=2 \ - # model.megatron_amp_O2=False \ - # model.apply_query_key_layer_scaling=False \ - # model.tensor_model_parallel_size=1 \ - # model.optim.name=muadamw \ - # model.optim.weight_decay=0.1 \ - # model.optim.betas=[0.9,0.95] \ - # model.optim.lr=6e-4 \ - # model.optim.sched.warmup_steps=1000 \ - # model.optim.sched.constant_steps=0 \ - # model.optim.sched.min_lr=6e-5 \ - # model.add_position_embedding=False \ - # model.enc_num_layers=2 \ - # model.dec_num_layers=6 \ - # model.enc_cross_attention=[0] \ - # model.dec_cross_attention=[3,5] \ - # model.hidden_size=96 \ - # model.ffn_hidden_size=384 \ - # model.init_method_std=0.023 \ - # model.num_attention_heads=12 \ - # model.max_position_embeddings=1024 \ - # model.encoder_seq_length=1024 \ - # model.tokenizer.library=megatron \ - # model.tokenizer.type=GPT2BPETokenizer \ - # model.tokenizer.merge_file=/home/TestData/nlp/megatron_retro/gpt2-merges.txt \ - # model.tokenizer.vocab_file=/home/TestData/nlp/megatron_retro/gpt2-vocab.json \ - # model.data.data_prefix=[/home/TestData/nlp/megatron_retro/retro_wiki_test_text_document] \ - # model.data.knn_index=[/home/TestData/nlp/megatron_retro/knn2_map_wiki_test.idx] \ - # model.data.retrieval_prefix=/home/TestData/nlp/megatron_retro/retro_wiki_test_text_document \ - # model.data.index_mapping_dir=/home/TestData/nlp/megatron_retro \ - # model.data.num_workers=8 \ - # model.micro_batch_size=8 \ - # model.normalization=rmsnorm \ - # 
model.transformer_block_type=pre_ln \ - # model.bias_activation_fusion=True \ - # model.bias_dropout_add_fusion=False \ - # model.masked_softmax_fusion=True \ - # model.hidden_dropout=0 \ - # model.attention_dropout=0 \ - # model.fp32_residual_connection=True \ - # model.shape_file=/home/TestData/nlp/megatron_retro/o1_rel_shape_info_tiny.yaml - - # python -c "import pandas as pd - # import pathlib - # from pandas.testing import assert_frame_equal - # from tensorboard.backend.event_processing.event_accumulator import EventAccumulator - # import torch - # if not (torch.cuda.is_available() and 'A100' in torch.cuda.get_device_name()): - # import sys - # sys.exit(0) - # event_file = list(pathlib.Path('examples/nlp/language_modeling/retro_results/megatron_retro/smalltest').glob('events.out.tfevents*'))[0] - # ea = EventAccumulator(str(event_file)).Reload() - # vals = [] - # for i in ea.Scalars('reduced_train_loss'): - # vals.append(i.value) - # training_curve = pd.DataFrame({'loss': vals}) - # gt_curve = pd.read_csv('/home/TestData/nlp/megatron_retro/expected_learning_curve.csv') - # assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)" - - # rm -rf examples/nlp/language_modeling/retro_results - # - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - # if: "failure()" - - L2_BioMegatron_Bert_NER_Task: - needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/token_classification/token_classification_train.py \ - exp_manager.exp_dir=examples/nlp/language_modeling/token_classification_results \ - trainer.max_epochs=1 \ - model.dataset.data_dir=/home/TestData/nlp/ner \ - model.language_model.pretrained_model_name=biomegatron345m_biovocab_30k_cased \ - model.tokenizer.tokenizer_name=null - rm -rf examples/nlp/language_modeling/token_classification_results - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" - - L2_Megatron_GPT_Pretraining_and_Resume_Training_TP2: - needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=1 \ - model.optim.sched.constant_steps=1 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.normalization=rmsnorm \ - model.bias=False \ - model.bias_activation_fusion=False \ - 
model.bias_dropout_add_fusion=False \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_granularity=full \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings - - python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=6 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.normalization=rmsnorm \ - model.bias=False \ - model.bias_activation_fusion=False \ - model.bias_dropout_add_fusion=False \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_granularity=full \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings - - rm -rf examples/nlp/language_modeling/gpt_pretrain_results - rm -rf examples/nlp/language_modeling/gpt_index_mappings - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" - - L2_Megatron_GPT_with_Rope_Pretraining_and_Resume_Training_TP2: - needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=1 \ - model.optim.sched.constant_steps=1 \ - model.optim.sched.min_lr=8e-5 \ - 
model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.position_embedding_type=rope \ - model.rotary_percentage=0.5 \ - model.normalization=rmsnorm \ - model.bias=False \ - model.bias_activation_fusion=False \ - model.bias_dropout_add_fusion=False \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_granularity=full \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings - - # commented out to save time on github ci @adithyare - # python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - # trainer.devices=2 \ - # trainer.accelerator=gpu \ - # trainer.log_every_n_steps=1 \ - # trainer.val_check_interval=2 \ - # trainer.limit_val_batches=1 \ - # trainer.accumulate_grad_batches=1 \ - # trainer.max_steps=6 \ - # trainer.precision=16 \ - # trainer.gradient_clip_val=1.0 \ - # exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - # exp_manager.resume_if_exists=True \ - # model.tensor_model_parallel_size=2 \ - # model.optim.name=fused_adam \ - # model.optim.lr=2e-4 \ - # model.optim.sched.warmup_steps=2 \ - # model.optim.sched.constant_steps=2 \ - # model.optim.sched.min_lr=8e-5 \ - # model.max_position_embeddings=128 \ - # model.encoder_seq_length=128 \ - # model.data.seq_length=128 \ - # model.position_embedding_type=rope \ - # model.rotary_percentage=0.5 \ - # model.normalization=rmsnorm \ - # model.bias=False \ - # model.bias_activation_fusion=False \ - # model.bias_dropout_add_fusion=False \ - # model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - # model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - # model.num_layers=8 \ - # model.hidden_size=256 \ - # model.num_attention_heads=8 \ - # model.activations_checkpoint_method=block \ - # model.activations_checkpoint_granularity=full \ - # model.activations_checkpoint_num_layers=1 \ - # model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - # model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" - - rm -rf examples/nlp/language_modeling/gpt_pretrain_results - rm -rf examples/nlp/language_modeling/gpt_index_mappings + python examples/nlp/rag/rag_generating.py \ + trainer.devices=1 \ + trainer.precision='bf16-mixed' \ + indexing.embedder.model_path='/home/TestData/nlp/rag_pipeline/testing_models/embedders/sbert_nemo.nemo' \ + indexing.index_path='/home/TestData/nlp/rag_pipeline/testing_data/saved_index/sample_index' \ + generating.llm.model_path='/home/TestData/nlp/rag_pipeline/testing_models/llms/megatron_gpt_125m.nemo' \ + generating.inference.tokens_to_generate=50 \ + generating.inference.greedy=False \ + generating.inference.temperature=1.0 \ + generating.query='Which art schools did I applied to?' 
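The generating step above sets generating.inference.greedy=False with temperature=1.0, i.e. it samples from the model's output distribution instead of always taking the arg-max token. A minimal sketch of the difference between the two modes (generic softmax sampling, not NeMo's inference code):

# Sketch only: contrasts greedy decoding with temperature sampling, the two
# behaviours toggled by generating.inference.greedy / temperature above.
import math, random

def next_token(logits, greedy=False, temperature=1.0):
    if greedy:
        return max(range(len(logits)), key=lambda i: logits[i])
    scaled = [l / temperature for l in logits]
    m = max(scaled)
    probs = [math.exp(l - m) for l in scaled]
    total = sum(probs)
    probs = [p / total for p in probs]
    return random.choices(range(len(logits)), weights=probs, k=1)[0]

logits = [2.0, 1.0, 0.1]
print(next_token(logits, greedy=True))        # always index 0
print(next_token(logits, temperature=1.0))    # usually 0, sometimes 1 or 2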
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" if: "failure()" + L2_BioMegatron_Bert_NER_Task: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/token_classification/token_classification_train.py \ + exp_manager.exp_dir=examples/nlp/language_modeling/token_classification_results \ + trainer.max_epochs=1 \ + model.dataset.data_dir=/home/TestData/nlp/ner \ + model.language_model.pretrained_model_name=biomegatron345m_biovocab_30k_cased \ + model.tokenizer.tokenizer_name=null + AFTER_SCRIPT: | + rm -rf examples/nlp/language_modeling/token_classification_results + + L2_Megatron_GPT_Pretraining_and_Resume_Training_TP2: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=2 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=3 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=1 \ + model.optim.sched.constant_steps=1 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.normalization=rmsnorm \ + model.bias=False \ + model.bias_activation_fusion=False \ + model.bias_dropout_add_fusion=False \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=block \ + model.activations_checkpoint_granularity=full \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings + + python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=2 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=6 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + exp_manager.resume_if_exists=True \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=2 \ + model.optim.sched.constant_steps=2 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.normalization=rmsnorm \ + model.bias=False \ + model.bias_activation_fusion=False \ + model.bias_dropout_add_fusion=False \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + 
model.activations_checkpoint_method=block \ + model.activations_checkpoint_granularity=full \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings + AFTER_SCRIPT: | + rm -rf examples/nlp/language_modeling/gpt_pretrain_results + rm -rf examples/nlp/language_modeling/gpt_index_mappings + + L2_Megatron_GPT_with_Rope_Pretraining_and_Resume_Training_TP2: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=2 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=3 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=1 \ + model.optim.sched.constant_steps=1 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.position_embedding_type=rope \ + model.rotary_percentage=0.5 \ + model.normalization=rmsnorm \ + model.bias=False \ + model.bias_activation_fusion=False \ + model.bias_dropout_add_fusion=False \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=block \ + model.activations_checkpoint_granularity=full \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings + + # commented out to save time on github ci @adithyare + # python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + # trainer.devices=2 \ + # trainer.accelerator=gpu \ + # trainer.log_every_n_steps=1 \ + # trainer.val_check_interval=2 \ + # trainer.limit_val_batches=1 \ + # trainer.accumulate_grad_batches=1 \ + # trainer.max_steps=6 \ + # trainer.precision=16 \ + # trainer.gradient_clip_val=1.0 \ + # exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + # exp_manager.resume_if_exists=True \ + # model.tensor_model_parallel_size=2 \ + # model.optim.name=fused_adam \ + # model.optim.lr=2e-4 \ + # model.optim.sched.warmup_steps=2 \ + # model.optim.sched.constant_steps=2 \ + # model.optim.sched.min_lr=8e-5 \ + # model.max_position_embeddings=128 \ + # model.encoder_seq_length=128 \ + # model.data.seq_length=128 \ + # model.position_embedding_type=rope \ + # model.rotary_percentage=0.5 \ + # model.normalization=rmsnorm \ + # model.bias=False \ + # model.bias_activation_fusion=False \ + # model.bias_dropout_add_fusion=False \ + # model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + # 
model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + # model.num_layers=8 \ + # model.hidden_size=256 \ + # model.num_attention_heads=8 \ + # model.activations_checkpoint_method=block \ + # model.activations_checkpoint_granularity=full \ + # model.activations_checkpoint_num_layers=1 \ + # model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + # model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" + AFTER_SCRIPT: | + rm -rf examples/nlp/language_modeling/gpt_pretrain_results + rm -rf examples/nlp/language_modeling/gpt_index_mappings + # This test requires Ampere but some of the test GPUs are Volta # Need to add a check for compute capability before uncommenting this test # - name: L2: Megatron GPT with Rope Pretraining using Flash Attention and Resume Training TP=2 @@ -4209,1910 +3318,1551 @@ jobs: # # model.optim.sched.min_lr=8e-5 \ # # model.max_position_embeddings=128 \ # # model.encoder_seq_length=128 \ - # # model.data.seq_length=128 \ - # # model.position_embedding_type=rope \ - # # model.rotary_percentage=0.5 \ - # # model.normalization=rmsnorm \ - # # model.bias=False \ - # # model.bias_activation_fusion=False \ - # # model.bias_dropout_add_fusion=False \ - # # model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - # # model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - # # model.num_layers=8 \ - # # model.hidden_size=256 \ - # # model.num_attention_heads=8 \ - # # model.activations_checkpoint_method=block \ - # # model.activations_checkpoint_granularity=full \ - # # model.activations_checkpoint_num_layers=1 \ - # # model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - # # model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings \ - # # model.use_flash_attention=True" - # rm -rf examples/nlp/language_modeling/gpt_pretrain_results" - # rm -rf examples/nlp/language_modeling/gpt_index_mappings" - # } - # } - - L2_Megatron_GPT_with_ALiBi_Pretraining_and_Resume_Training_TP2: - needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=1 \ - model.optim.sched.constant_steps=1 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.position_embedding_type=alibi \ - model.normalization=rmsnorm \ - model.bias=False \ - 
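The comment above notes that the flash-attention test needs Ampere GPUs while some CI runners are Volta, and that a compute-capability check is still missing. One way such a guard could look, sketched with plain PyTorch in the spirit of the A100 check used elsewhere in this file (the workflow itself does not implement this yet):

# Sketch only: skip logic for the compute-capability guard suggested above.
# Ampere corresponds to CUDA compute capability 8.x.
import sys
import torch

if not torch.cuda.is_available():
    sys.exit(0)  # nothing to test without a GPU

major, _minor = torch.cuda.get_device_capability(0)
if major < 8:
    print("Skipping: flash-attention test requires Ampere (sm_80) or newer")
    sys.exit(0)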
model.bias_activation_fusion=False \ - model.bias_dropout_add_fusion=False \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_granularity=full \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings - - # not testing resume functionality to save time on ci @adithyare - #python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - #trainer.devices=2 \ - #trainer.accelerator=gpu \ - #trainer.log_every_n_steps=1 \ - #trainer.val_check_interval=2 \ - #trainer.limit_val_batches=1 \ - #trainer.accumulate_grad_batches=1 \ - #trainer.max_steps=6 \ - #trainer.precision=16 \ - #trainer.gradient_clip_val=1.0 \ - #exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - #exp_manager.resume_if_exists=True \ - #model.tensor_model_parallel_size=2 \ - #model.optim.name=fused_adam \ - #model.optim.lr=2e-4 \ - #model.optim.sched.warmup_steps=2 \ - #model.optim.sched.constant_steps=2 \ - #model.optim.sched.min_lr=8e-5 \ - #model.max_position_embeddings=128 \ - #model.encoder_seq_length=128 \ - #model.data.seq_length=128 \ - #model.position_embedding_type=alibi \ - #model.normalization=rmsnorm \ - #model.bias=False \ - #model.bias_activation_fusion=False \ - #model.bias_dropout_add_fusion=False \ - #model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - #model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - #model.num_layers=8 \ - #model.hidden_size=256 \ - #model.num_attention_heads=8 \ - #model.activations_checkpoint_method=block \ - #model.activations_checkpoint_granularity=full \ - #model.activations_checkpoint_num_layers=1 \ - #model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - #model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" - - rm -rf examples/nlp/language_modeling/gpt_pretrain_results - rm -rf examples/nlp/language_modeling/gpt_index_mappings - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" - - L2_Megatron_GPT_with_KERPLE_Pretraining_and_Resume_Training_TP2: - needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - 
model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=1 \ - model.optim.sched.constant_steps=1 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.position_embedding_type=kerple \ - model.normalization=rmsnorm \ - model.bias=False \ - model.bias_activation_fusion=False \ - model.bias_dropout_add_fusion=False \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_granularity=full \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings - - # commented out to save time on github ci @adithyare - #python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - #trainer.devices=2 \ - #trainer.accelerator=gpu \ - #trainer.log_every_n_steps=1 \ - #trainer.val_check_interval=2 \ - #trainer.limit_val_batches=1 \ - #trainer.accumulate_grad_batches=1 \ - #trainer.max_steps=6 \ - #trainer.precision=16 \ - #trainer.gradient_clip_val=1.0 \ - #exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - #exp_manager.resume_if_exists=True \ - #model.tensor_model_parallel_size=2 \ - #model.optim.name=fused_adam \ - #model.optim.lr=2e-4 \ - #model.optim.sched.warmup_steps=2 \ - #model.optim.sched.constant_steps=2 \ - #model.optim.sched.min_lr=8e-5 \ - #model.max_position_embeddings=128 \ - #model.encoder_seq_length=128 \ - #model.data.seq_length=128 \ - #model.position_embedding_type=kerple \ - #model.normalization=rmsnorm \ - #model.bias=False \ - #model.bias_activation_fusion=False \ - #model.bias_dropout_add_fusion=False \ - #model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - #model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - #model.num_layers=8 \ - #model.hidden_size=256 \ - #model.num_attention_heads=8 \ - #model.activations_checkpoint_method=block \ - #model.activations_checkpoint_granularity=full \ - #model.activations_checkpoint_num_layers=1 \ - #model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - #model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" - - rm -rf examples/nlp/language_modeling/gpt_pretrain_results - rm -rf examples/nlp/language_modeling/gpt_index_mappings - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" - - L2_Megatron_GPT_Pretraining_and_Resume_Training_PP2: - needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python 
examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.precision=bf16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - model.pipeline_model_parallel_size=2 \ - model.tensor_model_parallel_size=1 \ - model.mcore_gpt=True \ - model.megatron_amp_O2=True \ - model.optim.name=distributed_fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=1 \ - model.optim.sched.constant_steps=1 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.activation=fast-swiglu \ - model.bias_activation_fusion=False \ - model.hidden_dropout=0.0 \ - model.attention_dropout=0.0 \ - model.transformer_block_type=normformer \ - model.headscale=True \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings - - python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=6 \ - trainer.precision=bf16 \ - trainer.gradient_clip_val=1.0 \ - model.mcore_gpt=True \ - model.megatron_amp_O2=True \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.pipeline_model_parallel_size=2 \ - model.tensor_model_parallel_size=1 \ - model.optim.name=distributed_fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.activation=fast-swiglu \ - model.bias_activation_fusion=False \ - model.hidden_dropout=0.0 \ - model.attention_dropout=0.0 \ - model.transformer_block_type=normformer \ - model.headscale=True \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings - - rm -rf examples/nlp/language_modeling/gpt_pretrain_results - rm -rf examples/nlp/language_modeling/gpt_index_mappings - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" - - #@athitten Remove /home/TestData/nlp/megatron_sft/trec.jsonl for validation and 
test file until we have support for multiple dataloaders in lightning 2.0 - L2_Megatron_GPT_Finetuning_PP2: - needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \ - trainer.devices=2 \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - +trainer.limit_val_batches=2 \ - trainer.max_steps=3 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_sft_results \ - model.pipeline_model_parallel_size=2 \ - model.tensor_model_parallel_size=1 \ - model.restore_from_path=/home/TestData/nlp/megatron_gpt/PP2/gpt_pp2_tp1.nemo \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.peft.peft_scheme=null \ - model.data.train_ds.micro_batch_size=1 \ - model.data.train_ds.global_batch_size=4 \ - model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl,/home/TestData/nlp/megatron_sft/trec.jsonl] \ - model.data.train_ds.concat_sampling_probabilities=[0.3,0.7] \ - model.data.train_ds.num_workers=0 \ - model.data.test_ds.micro_batch_size=1 \ - model.data.test_ds.global_batch_size=1 \ - model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.test_ds.names=[quarel] \ - model.data.validation_ds.micro_batch_size=1 \ - model.data.validation_ds.global_batch_size=1 \ - model.data.validation_ds.num_workers=0 \ - model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.validation_ds.names=[quarel] - - python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \ - trainer.devices=2 \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - +trainer.limit_val_batches=2 \ - trainer.max_steps=3 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_sft_results \ - model.pipeline_model_parallel_size=2 \ - model.tensor_model_parallel_size=1 \ - model.restore_from_path=/home/TestData/nlp/megatron_gpt/PP2/gpt_pp2_tp1.nemo \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.peft.peft_scheme=null \ - model.data.train_ds.micro_batch_size=1 \ - model.data.train_ds.global_batch_size=4 \ - model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl,/home/TestData/nlp/megatron_sft/trec.jsonl] \ - model.data.train_ds.concat_sampling_probabilities=[0.3,0.7] \ - model.data.train_ds.num_workers=0 \ - model.data.test_ds.micro_batch_size=1 \ - model.data.test_ds.global_batch_size=1 \ - model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.test_ds.names=[quarel] \ - model.data.validation_ds.micro_batch_size=1 \ - model.data.validation_ds.global_batch_size=1 \ - model.data.validation_ds.num_workers=0 \ - model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.validation_ds.names=[quarel] - - rm -rf examples/nlp/language_modeling/gpt_sft_results - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" - - L2_Megatron_GPT_Finetuning_StarCoder_PP1: - needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - 
container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/tuning/megatron_gpt_sft.py \ - trainer.devices=1 \ - trainer.num_nodes=1 \ - trainer.precision=32 \ - trainer.max_steps=4 \ - trainer.val_check_interval=4 \ - trainer.enable_checkpointing=False \ - +trainer.limit_val_batches=2 \ - +trainer.limit_test_batches=2 \ - exp_manager.checkpoint_callback_params.save_best_model=False \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_sft_results \ - model.optim.name=distributed_fused_adam \ - model.restore_from_path=/home/TestData/nlp/megatron_gpt/starcoder-ci-nemo/megatron_starcoder_tp1_pp1.nemo \ - model.tensor_model_parallel_size=1 \ - model.pipeline_model_parallel_size=1 \ - model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.train_ds.num_workers=0 \ - model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.validation_ds.num_workers=0 \ - model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.test_ds.num_workers=0 \ - model.data.train_ds.concat_sampling_probabilities=[1.0] - - rm -rf examples/nlp/language_modeling/gpt_sft_results - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" - - L2_Megatron_GPT_PEFT_Lora_PP2: - needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - rm -rf examples/nlp/language_modeling/gpt_peft_lora_results_pp2 - - python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \ - trainer.devices=2 \ - trainer.log_every_n_steps=1 \ - trainer.max_epochs=9999 \ - trainer.max_steps=3 \ - trainer.val_check_interval=3 \ - ++trainer.limit_val_batches=2 \ - trainer.precision=16 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_peft_lora_results_pp2 \ - model.pipeline_model_parallel_size=2 \ - model.tensor_model_parallel_size=1 \ - model.restore_from_path=/home/TestData/nlp/megatron_gpt/PP2/gpt_pp2_tp1.nemo \ - model.peft.peft_scheme=lora \ - model.answer_only_loss=True \ - model.micro_batch_size=1 \ - model.global_batch_size=1 \ - model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.train_ds.concat_sampling_probabilities=[1.0] \ - model.data.train_ds.num_workers=0 \ - model.data.validation_ds.num_workers=0 \ - model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.validation_ds.names=[quarel] - - rm -rf examples/nlp/language_modeling/gpt_peft_lora_results_pp2 - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" - - L2_Megatron_GPT_PEFT_Lora_TP2: - needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume 
/mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - rm -rf /home/TestData/nlp/lora_tuning_tp2 - - python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \ - trainer.devices=2 \ - trainer.log_every_n_steps=1 \ - trainer.max_epochs=9999 \ - trainer.max_steps=3 \ - trainer.val_check_interval=3 \ - ++trainer.limit_val_batches=2 \ - trainer.precision=16 \ - exp_manager.exp_dir=/home/TestData/nlp/lora_tuning_tp2 \ - model.pipeline_model_parallel_size=1 \ - model.tensor_model_parallel_size=2 \ - model.restore_from_path=/home/TestData/nlp/megatron_gpt/TP2/megatron_gpt_tp2.nemo \ - model.peft.peft_scheme='lora' \ - model.answer_only_loss=True \ - model.micro_batch_size=1 \ - model.global_batch_size=1 \ - model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.train_ds.concat_sampling_probabilities=[1.0] \ - model.data.train_ds.num_workers=0 \ - model.data.validation_ds.num_workers=0 \ - model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.validation_ds.names=[quarel] - - python examples/nlp/language_modeling/tuning/megatron_gpt_generate.py \ - model.restore_from_path=/home/TestData/nlp/megatron_gpt/TP2/megatron_gpt_tp2.nemo \ - model.peft.restore_from_path=/home/TestData/nlp/lora_tuning_tp2/megatron_gpt_peft_lora_tuning/checkpoints/megatron_gpt_peft_lora_tuning.nemo \ - model.tensor_model_parallel_size=2 \ - trainer.devices=2 \ - model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel_4.jsonl] \ - model.data.test_ds.names=['quarel4'] \ - model.global_batch_size=2 \ - model.micro_batch_size=1 \ - model.data.test_ds.tokens_to_generate=10 \ - model.data.test_ds.write_predictions_to_file=True \ - model.data.test_ds.output_file_path_prefix='/home/TestData/nlp/lora_tuning_tp2/out' \ - inference.greedy=True \ - inference.repetition_penalty=1.0 \ - inference.outfile_path='/home/TestData/nlp/lora_tuning_tp2/out.jsonl' - - rm -rf /home/TestData/nlp/lora_tuning_tp2 - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + # # model.data.seq_length=128 \ + # # model.position_embedding_type=rope \ + # # model.rotary_percentage=0.5 \ + # # model.normalization=rmsnorm \ + # # model.bias=False \ + # # model.bias_activation_fusion=False \ + # # model.bias_dropout_add_fusion=False \ + # # model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + # # model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + # # model.num_layers=8 \ + # # model.hidden_size=256 \ + # # model.num_attention_heads=8 \ + # # model.activations_checkpoint_method=block \ + # # model.activations_checkpoint_granularity=full \ + # # model.activations_checkpoint_num_layers=1 \ + # # model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + # # model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings \ + # # model.use_flash_attention=True" + # rm -rf examples/nlp/language_modeling/gpt_pretrain_results" + # rm -rf examples/nlp/language_modeling/gpt_index_mappings" + # } + # } - L2_Megatron_GPT_Eval: + L2_Megatron_GPT_with_ALiBi_Pretraining_and_Resume_Training_TP2: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - 
--device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/megatron_gpt_eval.py \ - gpt_model_file=/home/TestData/nlp/megatron_gpt/125M/megatron_gpt.nemo \ - prompts=['How to fix GPU memory? A:'] \ - tensor_model_parallel_size=1 \ - inference.tokens_to_generate=32 \ - trainer.precision=32 - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=2 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=3 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=1 \ + model.optim.sched.constant_steps=1 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.position_embedding_type=alibi \ + model.normalization=rmsnorm \ + model.bias=False \ + model.bias_activation_fusion=False \ + model.bias_dropout_add_fusion=False \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=block \ + model.activations_checkpoint_granularity=full \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings + + # not testing resume functionality to save time on ci @adithyare + #python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + #trainer.devices=2 \ + #trainer.accelerator=gpu \ + #trainer.log_every_n_steps=1 \ + #trainer.val_check_interval=2 \ + #trainer.limit_val_batches=1 \ + #trainer.accumulate_grad_batches=1 \ + #trainer.max_steps=6 \ + #trainer.precision=16 \ + #trainer.gradient_clip_val=1.0 \ + #exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + #exp_manager.resume_if_exists=True \ + #model.tensor_model_parallel_size=2 \ + #model.optim.name=fused_adam \ + #model.optim.lr=2e-4 \ + #model.optim.sched.warmup_steps=2 \ + #model.optim.sched.constant_steps=2 \ + #model.optim.sched.min_lr=8e-5 \ + #model.max_position_embeddings=128 \ + #model.encoder_seq_length=128 \ + #model.data.seq_length=128 \ + #model.position_embedding_type=alibi \ + #model.normalization=rmsnorm \ + #model.bias=False \ + #model.bias_activation_fusion=False \ + #model.bias_dropout_add_fusion=False \ + #model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + #model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + #model.num_layers=8 \ + #model.hidden_size=256 \ + #model.num_attention_heads=8 \ + 
#model.activations_checkpoint_method=block \ + #model.activations_checkpoint_granularity=full \ + #model.activations_checkpoint_num_layers=1 \ + #model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + #model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" + AFTER_SCRIPT: | + rm -rf examples/nlp/language_modeling/gpt_pretrain_results + rm -rf examples/nlp/language_modeling/gpt_index_mappings - L2_Megatron_GPT_Eval_PP2: + L2_Megatron_GPT_with_KERPLE_Pretraining_and_Resume_Training_TP2: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/megatron_gpt_eval.py \ - gpt_model_file=/home/TestData/nlp/megatron_gpt/PP2/gpt_pp2_tp1.nemo \ - server=False \ - tensor_model_parallel_size=1 \ - pipeline_model_parallel_size=2 \ - trainer.devices=2 \ - trainer.num_nodes=1 \ - trainer.precision=32 - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=2 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=3 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=1 \ + model.optim.sched.constant_steps=1 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.position_embedding_type=kerple \ + model.normalization=rmsnorm \ + model.bias=False \ + model.bias_activation_fusion=False \ + model.bias_dropout_add_fusion=False \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=block \ + model.activations_checkpoint_granularity=full \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings + + # commented out to save time on github ci @adithyare + #python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + #trainer.devices=2 \ + #trainer.accelerator=gpu \ + #trainer.log_every_n_steps=1 \ + #trainer.val_check_interval=2 \ + #trainer.limit_val_batches=1 \ + #trainer.accumulate_grad_batches=1 \ + #trainer.max_steps=6 \ + #trainer.precision=16 \ + #trainer.gradient_clip_val=1.0 \ + 
#exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + #exp_manager.resume_if_exists=True \ + #model.tensor_model_parallel_size=2 \ + #model.optim.name=fused_adam \ + #model.optim.lr=2e-4 \ + #model.optim.sched.warmup_steps=2 \ + #model.optim.sched.constant_steps=2 \ + #model.optim.sched.min_lr=8e-5 \ + #model.max_position_embeddings=128 \ + #model.encoder_seq_length=128 \ + #model.data.seq_length=128 \ + #model.position_embedding_type=kerple \ + #model.normalization=rmsnorm \ + #model.bias=False \ + #model.bias_activation_fusion=False \ + #model.bias_dropout_add_fusion=False \ + #model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + #model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + #model.num_layers=8 \ + #model.hidden_size=256 \ + #model.num_attention_heads=8 \ + #model.activations_checkpoint_method=block \ + #model.activations_checkpoint_granularity=full \ + #model.activations_checkpoint_num_layers=1 \ + #model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + #model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" + AFTER_SCRIPT: | + rm -rf examples/nlp/language_modeling/gpt_pretrain_results + rm -rf examples/nlp/language_modeling/gpt_index_mappings - L2_Megatron_GPT_SFT_Eval_inference_seq_len_greaterThan_training_seq_len: + L2_Megatron_GPT_Pretraining_and_Resume_Training_PP2: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/tuning/megatron_gpt_generate.py \ - model.restore_from_path=/home/TestData/nlp/megatron_gpt_sft/megatron_gpt_rope_sft.nemo \ - model.peft.restore_from_path=null \ - model.data.test_ds.file_names=[/home/TestData/nlp/megatron_gpt_sft/sample.jsonl] \ - model.data.test_ds.names=[test] \ - model.data.test_ds.global_batch_size=1 \ - model.data.test_ds.micro_batch_size=1 \ - model.data.test_ds.tokens_to_generate=30 \ - model.data.test_ds.max_seq_length=6000 \ - model.data.test_ds.write_predictions_to_file=True \ - model.data.test_ds.output_file_path_prefix=examples/nlp/language_modeling/out \ - inference.greedy=True \ - inference.repetition_penalty=1.0 \ - inference.outfile_path=examples/nlp/language_modeling/out.jsonl && \ - rm -rf examples/nlp/language_modeling/out.jsonl - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + trainer.devices=2 \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=2 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=3 \ + trainer.precision=bf16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + model.pipeline_model_parallel_size=2 \ + model.tensor_model_parallel_size=1 \ + model.mcore_gpt=True \ + model.megatron_amp_O2=True \ + model.optim.name=distributed_fused_adam \ + model.optim.lr=2e-4 \ + 
model.optim.sched.warmup_steps=1 \ + model.optim.sched.constant_steps=1 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.activation=fast-swiglu \ + model.bias_activation_fusion=False \ + model.hidden_dropout=0.0 \ + model.attention_dropout=0.0 \ + model.transformer_block_type=normformer \ + model.headscale=True \ + model.data.seq_length=128 \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=block \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings + + python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + trainer.devices=2 \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=2 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=6 \ + trainer.precision=bf16 \ + trainer.gradient_clip_val=1.0 \ + model.mcore_gpt=True \ + model.megatron_amp_O2=True \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + exp_manager.resume_if_exists=True \ + model.pipeline_model_parallel_size=2 \ + model.tensor_model_parallel_size=1 \ + model.optim.name=distributed_fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=2 \ + model.optim.sched.constant_steps=2 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.activation=fast-swiglu \ + model.bias_activation_fusion=False \ + model.hidden_dropout=0.0 \ + model.attention_dropout=0.0 \ + model.transformer_block_type=normformer \ + model.headscale=True \ + model.data.seq_length=128 \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=block \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings + AFTER_SCRIPT: | + rm -rf examples/nlp/language_modeling/gpt_pretrain_results + rm -rf examples/nlp/language_modeling/gpt_index_mappings - # TODO: Add this test back. 
Test was failing on CI machines due to HW error - # - name: L2: Megatron GPT Convert from Megatron-LM checkpoing and Eval - # when { - # anyOf { - # branch main - # changeRequest target: main - # } - # } - # failFast true - # - run: | - # python -m torch.distributed.launch --nproc_per_node=2 \ - # examples/nlp/language_modeling/megatron_lm_ckpt_to_nemo.py \ - # --checkpoint_folder=/home/TestData/nlp/megatron_gpt/data/gpt/iter_0008700 \ - # --checkpoint_name=model_optim_rng.pt \ - # --hparams_file=/home/TestData/nlp/megatron_gpt/data/gpt/iter_0008700/hparams.yaml \ - # --nemo_file_path=examples/nlp/language_modeling/small_gpt.nemo \ - # --model_type=gpt \ - # --pipeline_model_parallel_size=1 \ - # --gpus_per_node=2 \ - # --tensor_model_parallel_size=2" - # python examples/nlp/language_modeling/megatron_gpt_eval.py \ - # --gpt_model_file=examples/nlp/language_modeling/small_gpt.nemo \ - # --tokens_to_generate=32 \ - # --tensor_model_parallel_size=2 \ - # --prompt=This is a test. - # rm examples/nlp/language_modeling/small_gpt.nemo - - # L2_Megatron_Change_Partitions - L2_Megatron_Change_Partitions_Reduce_TP_Num_Partitions_-2_to_1-_and_PP_Num_Partitions_-1_to_2: + L2_Megatron_GPT_Finetuning_PP2: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/megatron_change_num_partitions.py \ - --model_file /home/TestData/nlp/megatron_gpt/TP2/megatron_gpt_tp2.nemo \ - --target_file /home/TestData/nlp/megatron_gpt/TP2-Temp/test-reduce.nemo \ - --tensor_model_parallel_size 2 \ - --target_tensor_model_parallel_size 1 \ - --pipeline_model_parallel_size 1 \ - --target_pipeline_model_parallel_size 2 - - rm /home/TestData/nlp/megatron_gpt/TP2-Temp/test-reduce.nemo - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \ + trainer.devices=2 \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=2 \ + +trainer.limit_val_batches=2 \ + trainer.max_steps=3 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_sft_results \ + model.pipeline_model_parallel_size=2 \ + model.tensor_model_parallel_size=1 \ + model.restore_from_path=/home/TestData/nlp/megatron_gpt/PP2/gpt_pp2_tp1.nemo \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.peft.peft_scheme=null \ + model.data.train_ds.micro_batch_size=1 \ + model.data.train_ds.global_batch_size=4 \ + model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl,/home/TestData/nlp/megatron_sft/trec.jsonl] \ + model.data.train_ds.concat_sampling_probabilities=[0.3,0.7] \ + model.data.train_ds.num_workers=0 \ + model.data.test_ds.micro_batch_size=1 \ + model.data.test_ds.global_batch_size=1 \ + model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ + model.data.test_ds.names=[quarel] \ + model.data.validation_ds.micro_batch_size=1 \ + model.data.validation_ds.global_batch_size=1 \ + model.data.validation_ds.num_workers=0 \ + 
model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ + model.data.validation_ds.names=[quarel] + + python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \ + trainer.devices=2 \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=1 \ + +trainer.limit_val_batches=2 \ + trainer.max_steps=3 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_sft_results \ + model.pipeline_model_parallel_size=2 \ + model.tensor_model_parallel_size=1 \ + model.restore_from_path=/home/TestData/nlp/megatron_gpt/PP2/gpt_pp2_tp1.nemo \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.peft.peft_scheme=null \ + model.data.train_ds.micro_batch_size=1 \ + model.data.train_ds.global_batch_size=4 \ + model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl,/home/TestData/nlp/megatron_sft/trec.jsonl] \ + model.data.train_ds.concat_sampling_probabilities=[0.3,0.7] \ + model.data.train_ds.num_workers=0 \ + model.data.test_ds.micro_batch_size=1 \ + model.data.test_ds.global_batch_size=1 \ + model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ + model.data.test_ds.names=[quarel] \ + model.data.validation_ds.micro_batch_size=1 \ + model.data.validation_ds.global_batch_size=1 \ + model.data.validation_ds.num_workers=0 \ + model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ + model.data.validation_ds.names=[quarel] + AFTER_SCRIPT: | + rm -rf examples/nlp/language_modeling/gpt_sft_results - L2_Megatron_Change_Partitions_Increase_TP_Num_Partitions_-2_to_4-_and_PP_Num_Partitions_-1_to_2: + L2_Megatron_GPT_Finetuning_StarCoder_PP1: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/megatron_change_num_partitions.py \ - --model_file /home/TestData/nlp/megatron_gpt/TP2/megatron_gpt_tp2.nemo \ - --target_file /home/TestData/nlp/megatron_gpt/TP2-Temp/test-increase.nemo \ - --tensor_model_parallel_size 2 \ - --target_tensor_model_parallel_size 4 \ - --pipeline_model_parallel_size 1 \ - --target_pipeline_model_parallel_size 1 - - rm /home/TestData/nlp/megatron_gpt/TP2-Temp/test-increase.nemo - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" - - L2_Megatron_T5_Pretraining_and_Resume_Training_TP2: + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \ + trainer.devices=1 \ + trainer.num_nodes=1 \ + trainer.precision=32 \ + trainer.max_steps=4 \ + trainer.val_check_interval=4 \ + trainer.enable_checkpointing=False \ + +trainer.limit_val_batches=2 \ + +trainer.limit_test_batches=2 \ + exp_manager.checkpoint_callback_params.save_best_model=False \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_sft_results \ + model.peft.peft_scheme=none \ + model.optim.name=distributed_fused_adam \ + model.restore_from_path=/home/TestData/nlp/megatron_gpt/starcoder-ci-nemo/megatron_starcoder_tp1_pp1.nemo \ + model.tensor_model_parallel_size=1 \ + model.pipeline_model_parallel_size=1 \ + 
model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ + model.data.train_ds.num_workers=0 \ + model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ + model.data.validation_ds.num_workers=0 \ + model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ + model.data.test_ds.num_workers=0 \ + model.data.train_ds.concat_sampling_probabilities=[1.0] + AFTER_SCRIPT: | + rm -rf examples/nlp/language_modeling/gpt_sft_results + + L2_Megatron_GPT_Embedding: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation=swiglu \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method=block \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.position_embedding_type=relative \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation=fast-swiglu \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method=block \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type=pre_ln \ - model.decoder.transformer_block_type=pre_ln \ - model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ - model.data.data_impl=text_mmap \ - +model.data.data_impl_kwargs.newline_int=10 \ - +model.data.data_impl_kwargs.header_lines=0 \ - +model.data.data_impl_kwargs.workers=null \ - +model.data.data_impl_kwargs.sort_dataset_paths=False \ - model.share_token_embeddings=False \ - model.share_decoder_tokens_head_embeddings=False - - python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation=swiglu \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - 
model.encoder.activations_checkpoint_method=block \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.position_embedding_type=relative \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation=fast-swiglu \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method=block \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type=pre_ln \ - model.decoder.transformer_block_type=pre_ln \ - model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ - model.data.data_impl=text_mmap \ - +model.data.data_impl_kwargs.newline_int=10 \ - +model.data.data_impl_kwargs.header_lines=0 \ - +model.data.data_impl_kwargs.workers=null \ - +model.data.data_impl_kwargs.sort_dataset_paths=False \ - model.share_token_embeddings=False \ - model.share_decoder_tokens_head_embeddings=False - - rm -rf examples/nlp/language_modeling/t5_pretrain_results - rm -rf examples/nlp/language_modeling/t5_index_mappings - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + rm -rf /home/TestData/nlp/megatron_ir/working_dir + + python examples/nlp/information_retrieval/megatron_gpt_embedding_finetuning.py \ + exp_manager.exp_dir='/home/TestData/nlp/megatron_ir/working_dir' \ + model.global_batch_size=4 \ + model.micro_batch_size=4 \ + trainer.devices=1 \ + trainer.num_nodes=1 \ + trainer.max_epochs=null \ + trainer.max_steps=20 \ + trainer.val_check_interval=10 \ + model.restore_from_path='/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo' \ + model.peft.lora_tuning.adapter_dim=8 \ + model.data.validation_ds.query_file_names=[/home/TestData/nlp/megatron_ir/test_query.jsonl] \ + model.data.validation_ds.doc_file_names=[/home/TestData/nlp/megatron_ir/test_doc.jsonl] \ + model.data.validation_ds.write_embeddings_to_file=True \ + model.data.validation_ds.output_file_path_prefix='/home/TestData/nlp/megatron_ir/working_dir/val_embs' \ + model.data.train_ds.file_names=[/home/TestData/nlp/megatron_ir/train.jsonl] + + + python examples/nlp/information_retrieval/megatron_gpt_embedding_generate.py \ + trainer.devices=1 \ + trainer.num_nodes=1 \ + model.restore_from_path='/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo' \ + model.peft.restore_from_path='/home/TestData/nlp/megatron_ir/working_dir/megatron_gpt_peft_lora_tuning/checkpoints/megatron_gpt_peft_lora_tuning.nemo' \ + model.global_batch_size=4 \ + model.micro_batch_size=4 \ + model.peft.lora_tuning.adapter_dim=8 \ + model.data.test_ds.write_embeddings_to_file=True \ + model.data.test_ds.output_file_path_prefix='/home/TestData/nlp/megatron_ir/working_dir/test_embs' \ + model.data.test_ds.query_file_names=[/home/TestData/nlp/megatron_ir/test_query.jsonl] \ + model.data.test_ds.doc_file_names=[/home/TestData/nlp/megatron_ir/test_doc.jsonl] + AFTER_SCRIPT: | + rm -rf /home/TestData/nlp/megatron_ir/working_dir - L2_Megatron_T5_with_ALiBi_Pretraining_and_Resume_Training_TP2: + L2_Megatron_GPT_PEFT_Lora_PP2: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - 
--device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation=swiglu \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method=block \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.position_embedding_type=alibi \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation=swiglu \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method=block \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type=pre_ln \ - model.decoder.transformer_block_type=pre_ln \ - model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ - model.data.data_impl=text_mmap \ - +model.data.data_impl_kwargs.newline_int=10 \ - +model.data.data_impl_kwargs.header_lines=0 \ - +model.data.data_impl_kwargs.workers=null \ - +model.data.data_impl_kwargs.sort_dataset_paths=False \ - model.share_token_embeddings=False \ - model.share_decoder_tokens_head_embeddings=False - - python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation=swiglu \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method=block \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.position_embedding_type=alibi \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation=swiglu \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method=block \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type=pre_ln \ - model.decoder.transformer_block_type=pre_ln \ - model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ - 
model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ - model.data.data_impl=text_mmap \ - +model.data.data_impl_kwargs.newline_int=10 \ - +model.data.data_impl_kwargs.header_lines=0 \ - +model.data.data_impl_kwargs.workers=null \ - +model.data.data_impl_kwargs.sort_dataset_paths=False \ - model.share_token_embeddings=False \ - model.share_decoder_tokens_head_embeddings=False - - rm -rf examples/nlp/language_modeling/t5_pretrain_results - rm -rf examples/nlp/language_modeling/t5_index_mappings - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + rm -rf examples/nlp/language_modeling/gpt_peft_lora_results_pp2 + + python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \ + trainer.devices=2 \ + trainer.log_every_n_steps=1 \ + trainer.max_epochs=9999 \ + trainer.max_steps=3 \ + trainer.val_check_interval=3 \ + ++trainer.limit_val_batches=2 \ + trainer.precision=16 \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_peft_lora_results_pp2 \ + model.pipeline_model_parallel_size=2 \ + model.tensor_model_parallel_size=1 \ + model.restore_from_path=/home/TestData/nlp/megatron_gpt/PP2/gpt_pp2_tp1.nemo \ + model.peft.peft_scheme=lora \ + model.answer_only_loss=True \ + model.micro_batch_size=1 \ + model.global_batch_size=1 \ + model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ + model.data.train_ds.concat_sampling_probabilities=[1.0] \ + model.data.train_ds.num_workers=0 \ + model.data.validation_ds.num_workers=0 \ + model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ + model.data.validation_ds.names=[quarel] + AFTER_SCRIPT: | + rm -rf examples/nlp/language_modeling/gpt_peft_lora_results_pp2 - L2_Megatron_T5_with_KERPLE_Pretraining_and_Resume_Training_TP2: + L2_Megatron_GPT_PEFT_Lora_TP2: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation=swiglu \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method=block \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.position_embedding_type=kerple \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation=swiglu \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method=block \ - model.decoder.activations_checkpoint_num_layers=1 
\ - model.encoder.transformer_block_type=pre_ln \ - model.decoder.transformer_block_type=pre_ln \ - model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ - model.data.data_impl=text_mmap \ - +model.data.data_impl_kwargs.newline_int=10 \ - +model.data.data_impl_kwargs.header_lines=0 \ - +model.data.data_impl_kwargs.workers=null \ - +model.data.data_impl_kwargs.sort_dataset_paths=False \ - model.share_token_embeddings=False \ - model.share_decoder_tokens_head_embeddings=False - - python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation=swiglu \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method=block \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.position_embedding_type=kerple \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation=swiglu \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method=block \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type=pre_ln \ - model.decoder.transformer_block_type=pre_ln \ - model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ - model.data.data_impl=text_mmap \ - +model.data.data_impl_kwargs.newline_int=10 \ - +model.data.data_impl_kwargs.header_lines=0 \ - +model.data.data_impl_kwargs.workers=null \ - +model.data.data_impl_kwargs.sort_dataset_paths=False \ - model.share_token_embeddings=False \ - model.share_decoder_tokens_head_embeddings=False - - rm -rf examples/nlp/language_modeling/t5_pretrain_results - rm -rf examples/nlp/language_modeling/t5_index_mappings - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + rm -rf /home/TestData/nlp/lora_tuning_tp2 + + python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \ + trainer.devices=2 \ + trainer.log_every_n_steps=1 \ + trainer.max_epochs=9999 \ + trainer.max_steps=3 \ + trainer.val_check_interval=3 \ + ++trainer.limit_val_batches=2 \ + trainer.precision=16 \ + exp_manager.exp_dir=/home/TestData/nlp/lora_tuning_tp2 \ + model.pipeline_model_parallel_size=1 \ + model.tensor_model_parallel_size=2 \ + model.restore_from_path=/home/TestData/nlp/megatron_gpt/TP2/megatron_gpt_tp2.nemo \ + model.peft.peft_scheme='lora' \ + model.answer_only_loss=True \ + model.micro_batch_size=1 \ + model.global_batch_size=1 \ + 
model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ + model.data.train_ds.concat_sampling_probabilities=[1.0] \ + model.data.train_ds.num_workers=0 \ + model.data.validation_ds.num_workers=0 \ + model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ + model.data.validation_ds.names=[quarel] + + python examples/nlp/language_modeling/tuning/megatron_gpt_generate.py \ + model.restore_from_path=/home/TestData/nlp/megatron_gpt/TP2/megatron_gpt_tp2.nemo \ + model.peft.restore_from_path=/home/TestData/nlp/lora_tuning_tp2/megatron_gpt_peft_lora_tuning/checkpoints/megatron_gpt_peft_lora_tuning.nemo \ + model.tensor_model_parallel_size=2 \ + trainer.devices=2 \ + model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel_4.jsonl] \ + model.data.test_ds.names=['quarel4'] \ + model.global_batch_size=2 \ + model.micro_batch_size=1 \ + model.data.test_ds.tokens_to_generate=10 \ + model.data.test_ds.write_predictions_to_file=True \ + model.data.test_ds.output_file_path_prefix='/home/TestData/nlp/lora_tuning_tp2/out' \ + inference.greedy=True \ + inference.repetition_penalty=1.0 \ + inference.outfile_path='/home/TestData/nlp/lora_tuning_tp2/out.jsonl' + AFTER_SCRIPT: | + rm -rf /home/TestData/nlp/lora_tuning_tp2 - L2_Megatron_T5_Pretraining_and_Resume_Training_PP2: + L2_Megatron_GPT_Eval: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - model.pipeline_model_parallel_size=2 \ - model.pipeline_model_parallel_split_rank=1 \ - model.seq_length=256 \ - model.encoder.num_layers=4 \ - model.decoder.num_layers=1 \ - model.encoder.hidden_size=64 \ - model.decoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.decoder.num_attention_heads=8 \ - model.decoder.ffn_hidden_size=2048 \ - model.encoder.activation=gelu \ - model.encoder.activations_checkpoint_method=block \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type=pre_ln \ - model.decoder.transformer_block_type=post_ln \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings - - python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.pipeline_model_parallel_size=2 \ - 
model.pipeline_model_parallel_split_rank=1 \ - model.seq_length=256 \ - model.encoder.num_layers=4 \ - model.decoder.num_layers=1 \ - model.encoder.hidden_size=64 \ - model.decoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.decoder.num_attention_heads=8 \ - model.decoder.ffn_hidden_size=2048 \ - model.encoder.activation=gelu \ - model.encoder.activations_checkpoint_method=block \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type=pre_ln \ - model.decoder.transformer_block_type=post_ln \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings - - rm -rf examples/nlp/language_modeling/t5_pretrain_results - rm -rf examples/nlp/language_modeling/t5_index_mappings - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_gpt_eval.py \ + gpt_model_file=/home/TestData/nlp/megatron_gpt/125M/megatron_gpt.nemo \ + prompts=['How to fix GPU memory? A:'] \ + tensor_model_parallel_size=1 \ + inference.tokens_to_generate=32 \ + trainer.precision=32 - L2_Megatron_T5_w_Mixture_of_Expert_Pretraining: + L2_Megatron_GPT_Eval_PP2: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/megatron_t5_pretraining.py \ + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_gpt_eval.py \ + gpt_model_file=/home/TestData/nlp/megatron_gpt/PP2/gpt_pp2_tp1.nemo \ + server=False \ + tensor_model_parallel_size=1 \ + pipeline_model_parallel_size=2 \ trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - model.pipeline_model_parallel_split_rank=1 \ - model.seq_length=256 \ - model.encoder.num_layers=4 \ - model.decoder.num_layers=1 \ - model.encoder.num_moe_experts=4 \ - model.decoder.num_moe_experts=4 \ - model.encoder.moe_frequency=3 \ - model.decoder.moe_frequency=1 \ - model.encoder.hidden_size=64 \ - model.decoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.decoder.num_attention_heads=8 \ - model.decoder.ffn_hidden_size=2048 \ - model.encoder.activation=gelu \ - model.encoder.activations_checkpoint_method=block \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type=pre_ln \ - model.decoder.transformer_block_type=post_ln \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] \ - 
model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings - - rm -rf examples/nlp/language_modeling/t5_pretrain_results - rm -rf examples/nlp/language_modeling/t5_index_mappings - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + trainer.num_nodes=1 \ + trainer.precision=32 - L2_Megatron_UL2_Pretraining_and_Resume_Training_TP2: + L2_Megatron_GPT_SFT_Eval_inference_seq_len_greaterThan_training_seq_len: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/megatron_t5_pretraining.py -cn megatron_ul2_config \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation=swiglu \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method=block \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type=normformer \ - model.encoder.headscale=True \ - model.decoder.num_layers=4 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation=geglu \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method=block \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.decoder.transformer_block_type=normformer \ - model.decoder.headscale=False \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings - - python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation=swiglu \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method=block \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type=normformer \ - model.encoder.headscale=True \ - model.decoder.num_layers=4 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation=geglu \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method=block \ - 
model.decoder.activations_checkpoint_num_layers=1 \ - model.decoder.transformer_block_type=normformer \ - model.decoder.headscale=False \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings - - rm -rf examples/nlp/language_modeling/t5_pretrain_results - rm -rf examples/nlp/language_modeling/t5_index_mappings - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/tuning/megatron_gpt_generate.py \ + model.restore_from_path=/home/TestData/nlp/megatron_gpt_sft/megatron_gpt_rope_sft.nemo \ + model.peft.restore_from_path=null \ + model.data.test_ds.file_names=[/home/TestData/nlp/megatron_gpt_sft/sample.jsonl] \ + model.data.test_ds.names=[test] \ + model.data.test_ds.global_batch_size=1 \ + model.data.test_ds.micro_batch_size=1 \ + model.data.test_ds.tokens_to_generate=30 \ + model.data.test_ds.max_seq_length=6000 \ + model.data.test_ds.write_predictions_to_file=True \ + model.data.test_ds.output_file_path_prefix=examples/nlp/language_modeling/out \ + inference.greedy=True \ + inference.repetition_penalty=1.0 \ + inference.outfile_path=examples/nlp/language_modeling/out.jsonl + AFTER_SCRIPT: | + rm -rf examples/nlp/language_modeling/out.jsonl - L2_Megatron_T5_Eval: + # TODO: Add this test back. Test was failing on CI machines due to HW error + # - name: L2: Megatron GPT Convert from Megatron-LM checkpoing and Eval + # when { + # anyOf { + # branch main + # changeRequest target: main + # } + # } + # failFast true + # - run: | + # python -m torch.distributed.launch --nproc_per_node=2 \ + # examples/nlp/language_modeling/megatron_lm_ckpt_to_nemo.py \ + # --checkpoint_folder=/home/TestData/nlp/megatron_gpt/data/gpt/iter_0008700 \ + # --checkpoint_name=model_optim_rng.pt \ + # --hparams_file=/home/TestData/nlp/megatron_gpt/data/gpt/iter_0008700/hparams.yaml \ + # --nemo_file_path=examples/nlp/language_modeling/small_gpt.nemo \ + # --model_type=gpt \ + # --pipeline_model_parallel_size=1 \ + # --gpus_per_node=2 \ + # --tensor_model_parallel_size=2" + # python examples/nlp/language_modeling/megatron_gpt_eval.py \ + # --gpt_model_file=examples/nlp/language_modeling/small_gpt.nemo \ + # --tokens_to_generate=32 \ + # --tensor_model_parallel_size=2 \ + # --prompt=This is a test. + # rm examples/nlp/language_modeling/small_gpt.nemo + + # L2_Megatron_Change_Partitions + L2_Megatron_Change_Partitions_Reduce_TP_Num_Partitions_-2_to_1-_and_PP_Num_Partitions_-1_to_2: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/megatron_t5_eval.py \ - --model_file /home/TestData/nlp/megatron_t5/8m/megatron_t5_8m-refactor.nemo \ - --prompt 'How do I fix my GPU memory issue? I am seeing out of memory.' 
\ - --tensor_model_parallel_size 1 - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_change_num_partitions.py \ + --model_file /home/TestData/nlp/megatron_gpt/TP2/megatron_gpt_tp2.nemo \ + --target_file /home/TestData/nlp/megatron_gpt/TP2-Temp/test-reduce.nemo \ + --tensor_model_parallel_size 2 \ + --target_tensor_model_parallel_size 1 \ + --pipeline_model_parallel_size 1 \ + --target_pipeline_model_parallel_size 2 + AFTER_SCRIPT: | + rm /home/TestData/nlp/megatron_gpt/TP2-Temp/test-reduce.nemo - L2_Megatron_BART_Pretraining_and_Resume_Training_TP2: + L2_Megatron_Change_Partitions_Increase_TP_Num_Partitions_-2_to_4-_and_PP_Num_Partitions_-1_to_2: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/megatron_bart_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bart_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation='reglu' \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=4 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation='reglu' \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.data.data_prefix='{train:[1.0,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document],test:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document], validation:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document]}' - - python examples/nlp/language_modeling/megatron_bart_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=5 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=6 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bart_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation='reglu' \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=4 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - 
model.decoder.activation='reglu' \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.data.data_prefix='{train:[1.0,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document],test:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document], validation:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document]}' - - rm -rf examples/nlp/language_modeling/bart_pretrain_results - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_change_num_partitions.py \ + --model_file /home/TestData/nlp/megatron_gpt/TP2/megatron_gpt_tp2.nemo \ + --target_file /home/TestData/nlp/megatron_gpt/TP2-Temp/test-increase.nemo \ + --tensor_model_parallel_size 2 \ + --target_tensor_model_parallel_size 4 \ + --pipeline_model_parallel_size 1 \ + --target_pipeline_model_parallel_size 1 + AFTER_SCRIPT: | + rm /home/TestData/nlp/megatron_gpt/TP2-Temp/test-increase.nemo - L2_Megatron_BART_Pretraining_and_Resume_Training_PP2: + L2_Megatron_T5_Pretraining_and_Resume_Training_TP2: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/megatron_bart_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bart_pretrain_results \ - model.pipeline_model_parallel_size=2 \ - model.pipeline_model_parallel_split_rank=1 \ - model.seq_length=256 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation=geglu \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method=block \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=4 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation=geglu \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method=block \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.data.respect_document_boundaries=False \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] - - python examples/nlp/language_modeling/megatron_bart_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bart_pretrain_results \ - exp_manager.resume_if_exists=True \ - 
model.pipeline_model_parallel_size=2 \ - model.pipeline_model_parallel_split_rank=1 \ - model.seq_length=256 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation=geglu \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method=block \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=4 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation=geglu \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method=block \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.data.respect_document_boundaries=False \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] - - rm -rf examples/nlp/language_modeling/bart_pretrain_results - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_t5_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=10 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=10 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ + model.tensor_model_parallel_size=2 \ + model.seq_length=128 \ + model.encoder.num_layers=4 \ + model.encoder.hidden_size=64 \ + model.encoder.num_attention_heads=8 \ + model.encoder.activation=swiglu \ + model.encoder.masked_softmax_fusion=False \ + model.encoder.bias_activation_fusion=False \ + model.encoder.activations_checkpoint_method=block \ + model.encoder.activations_checkpoint_num_layers=1 \ + model.encoder.position_embedding_type=relative \ + model.decoder.num_layers=2 \ + model.decoder.hidden_size=64 \ + model.decoder.num_attention_heads=8 \ + model.decoder.activation=fast-swiglu \ + model.decoder.masked_softmax_fusion=False \ + model.decoder.bias_activation_fusion=False \ + model.decoder.activations_checkpoint_method=block \ + model.decoder.activations_checkpoint_num_layers=1 \ + model.encoder.transformer_block_type=pre_ln \ + model.decoder.transformer_block_type=pre_ln \ + model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ + model.data.data_impl=text_mmap \ + +model.data.data_impl_kwargs.newline_int=10 \ + +model.data.data_impl_kwargs.header_lines=0 \ + +model.data.data_impl_kwargs.workers=null \ + +model.data.data_impl_kwargs.sort_dataset_paths=False \ + model.share_token_embeddings=False \ + model.share_decoder_tokens_head_embeddings=False + + python examples/nlp/language_modeling/megatron_t5_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=1 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=10 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ + exp_manager.resume_if_exists=True \ + model.tensor_model_parallel_size=2 \ + model.seq_length=128 \ + 
model.encoder.num_layers=4 \ + model.encoder.hidden_size=64 \ + model.encoder.num_attention_heads=8 \ + model.encoder.activation=swiglu \ + model.encoder.masked_softmax_fusion=False \ + model.encoder.bias_activation_fusion=False \ + model.encoder.activations_checkpoint_method=block \ + model.encoder.activations_checkpoint_num_layers=1 \ + model.encoder.position_embedding_type=relative \ + model.decoder.num_layers=2 \ + model.decoder.hidden_size=64 \ + model.decoder.num_attention_heads=8 \ + model.decoder.activation=fast-swiglu \ + model.decoder.masked_softmax_fusion=False \ + model.decoder.bias_activation_fusion=False \ + model.decoder.activations_checkpoint_method=block \ + model.decoder.activations_checkpoint_num_layers=1 \ + model.encoder.transformer_block_type=pre_ln \ + model.decoder.transformer_block_type=pre_ln \ + model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ + model.data.data_impl=text_mmap \ + +model.data.data_impl_kwargs.newline_int=10 \ + +model.data.data_impl_kwargs.header_lines=0 \ + +model.data.data_impl_kwargs.workers=null \ + +model.data.data_impl_kwargs.sort_dataset_paths=False \ + model.share_token_embeddings=False \ + model.share_decoder_tokens_head_embeddings=False + AFTER_SCRIPT: | + rm -rf examples/nlp/language_modeling/t5_pretrain_results + rm -rf examples/nlp/language_modeling/t5_index_mappings - # L2: Megatron T5 GLUE/XNLI Finetuning - # TODO(Oktai15): update it in 1.8.0 version - L2_Megatron_T5_GLUE_RTE: + L2_Megatron_T5_with_ALiBi_Pretraining_and_Resume_Training_TP2: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py \ - trainer.devices=1 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - +trainer.limit_val_batches=2 \ - +trainer.limit_test_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=2 \ - trainer.precision=16 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_glue_results \ - model.restore_from_path=/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m-refactor.nemo \ - model.pipeline_model_parallel_size=1 \ - model.pipeline_model_parallel_split_rank=0 \ - model.data.train_ds.task_name=rte \ - model.data.train_ds.global_batch_size=4 \ - model.data.train_ds.micro_batch_size=2 \ - model.data.validation_ds.global_batch_size=2 \ - model.data.validation_ds.micro_batch_size=2 \ - model.data.train_ds.file_path=/home/TestData/nlp/megatron_t5/data/train_ci.tsv \ - model.data.validation_ds.task_name=rte \ - model.data.validation_ds.file_path=/home/TestData/nlp/megatron_t5/data/dev_ci.tsv - - rm -rf examples/nlp/language_modeling/t5_glue_results - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" - - L2_Megatron_T5_GLUE_XNLI: + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_t5_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + 
trainer.val_check_interval=10 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=10 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ + model.tensor_model_parallel_size=2 \ + model.seq_length=128 \ + model.encoder.num_layers=4 \ + model.encoder.hidden_size=64 \ + model.encoder.num_attention_heads=8 \ + model.encoder.activation=swiglu \ + model.encoder.masked_softmax_fusion=False \ + model.encoder.bias_activation_fusion=False \ + model.encoder.activations_checkpoint_method=block \ + model.encoder.activations_checkpoint_num_layers=1 \ + model.encoder.position_embedding_type=alibi \ + model.decoder.num_layers=2 \ + model.decoder.hidden_size=64 \ + model.decoder.num_attention_heads=8 \ + model.decoder.activation=swiglu \ + model.decoder.masked_softmax_fusion=False \ + model.decoder.bias_activation_fusion=False \ + model.decoder.activations_checkpoint_method=block \ + model.decoder.activations_checkpoint_num_layers=1 \ + model.encoder.transformer_block_type=pre_ln \ + model.decoder.transformer_block_type=pre_ln \ + model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ + model.data.data_impl=text_mmap \ + +model.data.data_impl_kwargs.newline_int=10 \ + +model.data.data_impl_kwargs.header_lines=0 \ + +model.data.data_impl_kwargs.workers=null \ + +model.data.data_impl_kwargs.sort_dataset_paths=False \ + model.share_token_embeddings=False \ + model.share_decoder_tokens_head_embeddings=False + + python examples/nlp/language_modeling/megatron_t5_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=1 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=10 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ + exp_manager.resume_if_exists=True \ + model.tensor_model_parallel_size=2 \ + model.seq_length=128 \ + model.encoder.num_layers=4 \ + model.encoder.hidden_size=64 \ + model.encoder.num_attention_heads=8 \ + model.encoder.activation=swiglu \ + model.encoder.masked_softmax_fusion=False \ + model.encoder.bias_activation_fusion=False \ + model.encoder.activations_checkpoint_method=block \ + model.encoder.activations_checkpoint_num_layers=1 \ + model.encoder.position_embedding_type=alibi \ + model.decoder.num_layers=2 \ + model.decoder.hidden_size=64 \ + model.decoder.num_attention_heads=8 \ + model.decoder.activation=swiglu \ + model.decoder.masked_softmax_fusion=False \ + model.decoder.bias_activation_fusion=False \ + model.decoder.activations_checkpoint_method=block \ + model.decoder.activations_checkpoint_num_layers=1 \ + model.encoder.transformer_block_type=pre_ln \ + model.decoder.transformer_block_type=pre_ln \ + model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ + model.data.data_impl=text_mmap \ + +model.data.data_impl_kwargs.newline_int=10 \ + +model.data.data_impl_kwargs.header_lines=0 \ + +model.data.data_impl_kwargs.workers=null \ + +model.data.data_impl_kwargs.sort_dataset_paths=False \ + model.share_token_embeddings=False \ + 
model.share_decoder_tokens_head_embeddings=False + AFTER_SCRIPT: | + rm -rf examples/nlp/language_modeling/t5_pretrain_results + rm -rf examples/nlp/language_modeling/t5_index_mappings + + L2_Megatron_T5_with_KERPLE_Pretraining_and_Resume_Training_TP2: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_t5_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=10 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=10 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ + model.tensor_model_parallel_size=2 \ + model.seq_length=128 \ + model.encoder.num_layers=4 \ + model.encoder.hidden_size=64 \ + model.encoder.num_attention_heads=8 \ + model.encoder.activation=swiglu \ + model.encoder.masked_softmax_fusion=False \ + model.encoder.bias_activation_fusion=False \ + model.encoder.activations_checkpoint_method=block \ + model.encoder.activations_checkpoint_num_layers=1 \ + model.encoder.position_embedding_type=kerple \ + model.decoder.num_layers=2 \ + model.decoder.hidden_size=64 \ + model.decoder.num_attention_heads=8 \ + model.decoder.activation=swiglu \ + model.decoder.masked_softmax_fusion=False \ + model.decoder.bias_activation_fusion=False \ + model.decoder.activations_checkpoint_method=block \ + model.decoder.activations_checkpoint_num_layers=1 \ + model.encoder.transformer_block_type=pre_ln \ + model.decoder.transformer_block_type=pre_ln \ + model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ + model.data.data_impl=text_mmap \ + +model.data.data_impl_kwargs.newline_int=10 \ + +model.data.data_impl_kwargs.header_lines=0 \ + +model.data.data_impl_kwargs.workers=null \ + +model.data.data_impl_kwargs.sort_dataset_paths=False \ + model.share_token_embeddings=False \ + model.share_decoder_tokens_head_embeddings=False + + python examples/nlp/language_modeling/megatron_t5_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=1 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=10 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ + exp_manager.resume_if_exists=True \ + model.tensor_model_parallel_size=2 \ + model.seq_length=128 \ + model.encoder.num_layers=4 \ + model.encoder.hidden_size=64 \ + model.encoder.num_attention_heads=8 \ + model.encoder.activation=swiglu \ + model.encoder.masked_softmax_fusion=False \ + model.encoder.bias_activation_fusion=False \ + model.encoder.activations_checkpoint_method=block \ + model.encoder.activations_checkpoint_num_layers=1 \ + model.encoder.position_embedding_type=kerple \ + model.decoder.num_layers=2 \ + model.decoder.hidden_size=64 \ + model.decoder.num_attention_heads=8 \ + model.decoder.activation=swiglu \ + model.decoder.masked_softmax_fusion=False \ + model.decoder.bias_activation_fusion=False \ + model.decoder.activations_checkpoint_method=block \ + model.decoder.activations_checkpoint_num_layers=1 \ + model.encoder.transformer_block_type=pre_ln \ + 
model.decoder.transformer_block_type=pre_ln \ + model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ + model.data.data_impl=text_mmap \ + +model.data.data_impl_kwargs.newline_int=10 \ + +model.data.data_impl_kwargs.header_lines=0 \ + +model.data.data_impl_kwargs.workers=null \ + +model.data.data_impl_kwargs.sort_dataset_paths=False \ + model.share_token_embeddings=False \ + model.share_decoder_tokens_head_embeddings=False + AFTER_SCRIPT: | + rm -rf examples/nlp/language_modeling/t5_pretrain_results + rm -rf examples/nlp/language_modeling/t5_index_mappings + + L2_Megatron_T5_Pretraining_and_Resume_Training_PP2: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_t5_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=10 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=10 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ + model.pipeline_model_parallel_size=2 \ + model.pipeline_model_parallel_split_rank=1 \ + model.seq_length=256 \ + model.encoder.num_layers=4 \ + model.decoder.num_layers=1 \ + model.encoder.hidden_size=64 \ + model.decoder.hidden_size=64 \ + model.encoder.num_attention_heads=8 \ + model.decoder.num_attention_heads=8 \ + model.decoder.ffn_hidden_size=2048 \ + model.encoder.activation=gelu \ + model.encoder.activations_checkpoint_method=block \ + model.encoder.activations_checkpoint_num_layers=1 \ + model.encoder.transformer_block_type=pre_ln \ + model.decoder.transformer_block_type=post_ln \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings + + python examples/nlp/language_modeling/megatron_t5_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=1 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=10 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ + exp_manager.resume_if_exists=True \ + model.pipeline_model_parallel_size=2 \ + model.pipeline_model_parallel_split_rank=1 \ + model.seq_length=256 \ + model.encoder.num_layers=4 \ + model.decoder.num_layers=1 \ + model.encoder.hidden_size=64 \ + model.decoder.hidden_size=64 \ + model.encoder.num_attention_heads=8 \ + model.decoder.num_attention_heads=8 \ + model.decoder.ffn_hidden_size=2048 \ + model.encoder.activation=gelu \ + model.encoder.activations_checkpoint_method=block \ + model.encoder.activations_checkpoint_num_layers=1 \ + model.encoder.transformer_block_type=pre_ln \ + model.decoder.transformer_block_type=post_ln \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings + AFTER_SCRIPT: | + rm -rf 
examples/nlp/language_modeling/t5_pretrain_results + rm -rf examples/nlp/language_modeling/t5_index_mappings + + L2_Megatron_T5_w_Mixture_of_Expert_Pretraining: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py \ - -cn megatron_t5_config_finetune_glue_xnli \ - trainer.devices=1 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - +trainer.limit_val_batches=2 \ - +trainer.limit_test_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=2 \ - trainer.precision=16 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_xnli_results \ - model.restore_from_path=/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m-refactor.nemo \ - model.pipeline_model_parallel_size=1 \ - model.pipeline_model_parallel_split_rank=0 \ - model.data.train_ds.global_batch_size=4 \ - model.data.train_ds.micro_batch_size=2 \ - model.data.validation_ds.global_batch_size=2 \ - model.data.validation_ds.micro_batch_size=2 \ - model.data.test_ds.global_batch_size=2 \ - model.data.test_ds.micro_batch_size=2 \ - model.data.train_ds.task_name=rte \ - model.data.train_ds.file_path=/home/TestData/nlp/megatron_t5/data/train_ci.tsv \ - model.data.validation_ds.task_name=xnli \ - model.data.validation_ds.file_path=/home/TestData/nlp/megatron_t5/data/xnli_dev_ci.tsv \ - model.data.test_ds.task_name=xnli \ - model.data.test_ds.file_path=/home/TestData/nlp/megatron_t5/data/xnli_dev_ci.tsv - - rm -rf examples/nlp/language_modeling/t5_xnli_results - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" - - L2_Megatron_T5_PEFT_Lora_TP2: + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_t5_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=10 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=10 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ + model.pipeline_model_parallel_split_rank=1 \ + model.seq_length=256 \ + model.encoder.num_layers=4 \ + model.decoder.num_layers=1 \ + model.encoder.num_moe_experts=4 \ + model.decoder.num_moe_experts=4 \ + model.encoder.moe_frequency=3 \ + model.decoder.moe_frequency=1 \ + model.encoder.hidden_size=64 \ + model.decoder.hidden_size=64 \ + model.encoder.num_attention_heads=8 \ + model.decoder.num_attention_heads=8 \ + model.decoder.ffn_hidden_size=2048 \ + model.encoder.activation=gelu \ + model.encoder.activations_checkpoint_method=block \ + model.encoder.activations_checkpoint_num_layers=1 \ + model.encoder.transformer_block_type=pre_ln \ + model.decoder.transformer_block_type=post_ln \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings + AFTER_SCRIPT: | + rm -rf 
examples/nlp/language_modeling/t5_pretrain_results + rm -rf examples/nlp/language_modeling/t5_index_mappings + + L2_Megatron_UL2_Pretraining_and_Resume_Training_TP2: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - rm -rf /home/TestData/nlp/t5_lora_tuning_tp2 + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_t5_pretraining.py -cn megatron_ul2_config \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=10 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=10 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ + model.tensor_model_parallel_size=2 \ + model.seq_length=128 \ + model.encoder.num_layers=4 \ + model.encoder.hidden_size=64 \ + model.encoder.num_attention_heads=8 \ + model.encoder.activation=swiglu \ + model.encoder.bias_activation_fusion=False \ + model.encoder.activations_checkpoint_method=block \ + model.encoder.activations_checkpoint_num_layers=1 \ + model.encoder.transformer_block_type=normformer \ + model.encoder.headscale=True \ + model.decoder.num_layers=4 \ + model.decoder.hidden_size=64 \ + model.decoder.num_attention_heads=8 \ + model.decoder.activation=geglu \ + model.decoder.bias_activation_fusion=False \ + model.decoder.activations_checkpoint_method=block \ + model.decoder.activations_checkpoint_num_layers=1 \ + model.decoder.transformer_block_type=normformer \ + model.decoder.headscale=False \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings + + python examples/nlp/language_modeling/megatron_t5_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=1 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=10 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ + exp_manager.resume_if_exists=True \ + model.tensor_model_parallel_size=2 \ + model.seq_length=128 \ + model.encoder.num_layers=4 \ + model.encoder.hidden_size=64 \ + model.encoder.num_attention_heads=8 \ + model.encoder.activation=swiglu \ + model.encoder.bias_activation_fusion=False \ + model.encoder.activations_checkpoint_method=block \ + model.encoder.activations_checkpoint_num_layers=1 \ + model.encoder.transformer_block_type=normformer \ + model.encoder.headscale=True \ + model.decoder.num_layers=4 \ + model.decoder.hidden_size=64 \ + model.decoder.num_attention_heads=8 \ + model.decoder.activation=geglu \ + model.decoder.bias_activation_fusion=False \ + model.decoder.activations_checkpoint_method=block \ + model.decoder.activations_checkpoint_num_layers=1 \ + model.decoder.transformer_block_type=normformer \ + model.decoder.headscale=False \ + 
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings + AFTER_SCRIPT: | + rm -rf examples/nlp/language_modeling/t5_pretrain_results + rm -rf examples/nlp/language_modeling/t5_index_mappings - python examples/nlp/language_modeling/tuning/megatron_t5_finetuning.py \ - trainer.devices=2 \ - trainer.log_every_n_steps=1 \ - trainer.max_epochs=9999 \ - trainer.max_steps=3 \ - trainer.val_check_interval=3 \ - ++trainer.limit_val_batches=2 \ - trainer.precision=16 \ - exp_manager.exp_dir=/home/TestData/nlp/t5_lora_tuning_tp2 \ - model.pipeline_model_parallel_size=1 \ - model.tensor_model_parallel_size=2 \ - model.restore_from_path=/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m_tp2.nemo \ - model.peft.peft_scheme=lora \ - model.answer_only_loss=True \ - model.micro_batch_size=1 \ - model.global_batch_size=1 \ - model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.train_ds.concat_sampling_probabilities=[1.0] \ - model.data.train_ds.num_workers=0 \ - model.data.validation_ds.num_workers=0 \ - model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.validation_ds.names=[quarel] - - python examples/nlp/language_modeling/tuning/megatron_t5_generate.py \ - model.restore_from_path=/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m_tp2.nemo \ - model.peft.restore_from_path=/home/TestData/nlp/t5_lora_tuning_tp2/megatron_t5_peft_lora_tuning/checkpoints/megatron_t5_peft_lora_tuning.nemo \ - model.peft.restore_from_ckpt_name=null \ - model.peft.restore_from_hparams_path=null \ - model.tensor_model_parallel_size=2 \ - trainer.devices=2 \ - model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel_4.jsonl] \ - model.data.test_ds.names=[quarel4] \ - model.global_batch_size=2 \ - model.micro_batch_size=1 \ - model.data.test_ds.tokens_to_generate=10 \ - model.data.test_ds.write_predictions_to_file=True \ - model.data.test_ds.output_file_path_prefix=/home/TestData/nlp/t5_lora_tuning_tp2/out \ - inference.greedy=True \ - inference.repetition_penalty=1.0 \ - inference.outfile_path=/home/TestData/nlp/t5_lora_tuning_tp2/out.jsonl + L2_Megatron_T5_Eval: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_t5_eval.py \ + --model_file /home/TestData/nlp/megatron_t5/8m/megatron_t5_8m-refactor.nemo \ + --prompt 'How do I fix my GPU memory issue? I am seeing out of memory.' 
\ + --tensor_model_parallel_size 1 - rm -rf /home/TestData/nlp/t5_lora_tuning_tp2 - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + L2_Megatron_BART_Pretraining_and_Resume_Training_TP2: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_bart_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=2 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=3 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/bart_pretrain_results \ + model.tensor_model_parallel_size=2 \ + model.seq_length=128 \ + model.encoder.num_layers=4 \ + model.encoder.hidden_size=64 \ + model.encoder.num_attention_heads=8 \ + model.encoder.activation='reglu' \ + model.encoder.bias_activation_fusion=False \ + model.encoder.activations_checkpoint_method='block' \ + model.encoder.activations_checkpoint_num_layers=1 \ + model.decoder.num_layers=4 \ + model.decoder.hidden_size=64 \ + model.decoder.num_attention_heads=8 \ + model.decoder.activation='reglu' \ + model.decoder.bias_activation_fusion=False \ + model.decoder.activations_checkpoint_method='block' \ + model.decoder.activations_checkpoint_num_layers=1 \ + model.data.data_prefix='{train:[1.0,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document],test:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document], validation:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document]}' + + python examples/nlp/language_modeling/megatron_bart_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=2 \ + trainer.limit_val_batches=5 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=6 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/bart_pretrain_results \ + exp_manager.resume_if_exists=True \ + model.tensor_model_parallel_size=2 \ + model.seq_length=128 \ + model.encoder.num_layers=4 \ + model.encoder.hidden_size=64 \ + model.encoder.num_attention_heads=8 \ + model.encoder.activation='reglu' \ + model.encoder.bias_activation_fusion=False \ + model.encoder.activations_checkpoint_method='block' \ + model.encoder.activations_checkpoint_num_layers=1 \ + model.decoder.num_layers=4 \ + model.decoder.hidden_size=64 \ + model.decoder.num_attention_heads=8 \ + model.decoder.activation='reglu' \ + model.decoder.bias_activation_fusion=False \ + model.decoder.activations_checkpoint_method='block' \ + model.decoder.activations_checkpoint_num_layers=1 \ + model.data.data_prefix='{train:[1.0,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document],test:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document], validation:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document]}' + AFTER_SCRIPT: | + rm -rf examples/nlp/language_modeling/bart_pretrain_results + + L2_Megatron_BART_Pretraining_and_Resume_Training_PP2: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_bart_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + 
trainer.log_every_n_steps=1 \ + trainer.val_check_interval=10 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=10 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/bart_pretrain_results \ + model.pipeline_model_parallel_size=2 \ + model.pipeline_model_parallel_split_rank=1 \ + model.seq_length=256 \ + model.encoder.num_layers=4 \ + model.encoder.hidden_size=64 \ + model.encoder.num_attention_heads=8 \ + model.encoder.activation=geglu \ + model.encoder.bias_activation_fusion=False \ + model.encoder.activations_checkpoint_method=block \ + model.encoder.activations_checkpoint_num_layers=1 \ + model.decoder.num_layers=4 \ + model.decoder.hidden_size=64 \ + model.decoder.num_attention_heads=8 \ + model.decoder.activation=geglu \ + model.decoder.bias_activation_fusion=False \ + model.decoder.activations_checkpoint_method=block \ + model.decoder.activations_checkpoint_num_layers=1 \ + model.data.respect_document_boundaries=False \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] + + python examples/nlp/language_modeling/megatron_bart_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=1 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=10 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/bart_pretrain_results \ + exp_manager.resume_if_exists=True \ + model.pipeline_model_parallel_size=2 \ + model.pipeline_model_parallel_split_rank=1 \ + model.seq_length=256 \ + model.encoder.num_layers=4 \ + model.encoder.hidden_size=64 \ + model.encoder.num_attention_heads=8 \ + model.encoder.activation=geglu \ + model.encoder.bias_activation_fusion=False \ + model.encoder.activations_checkpoint_method=block \ + model.encoder.activations_checkpoint_num_layers=1 \ + model.decoder.num_layers=4 \ + model.decoder.hidden_size=64 \ + model.decoder.num_attention_heads=8 \ + model.decoder.activation=geglu \ + model.decoder.bias_activation_fusion=False \ + model.decoder.activations_checkpoint_method=block \ + model.decoder.activations_checkpoint_num_layers=1 \ + model.data.respect_document_boundaries=False \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] + AFTER_SCRIPT: | + rm -rf examples/nlp/language_modeling/bart_pretrain_results + + # L2: Megatron T5 GLUE/XNLI Finetuning + # TODO(Oktai15): update it in 1.8.0 version + L2_Megatron_T5_GLUE_RTE: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + python examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py \ + trainer.devices=1 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=1 \ + +trainer.limit_val_batches=2 \ + +trainer.limit_test_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=2 \ + trainer.precision=16 \ + exp_manager.exp_dir=examples/nlp/language_modeling/t5_glue_results \ + model.restore_from_path=/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m-refactor.nemo \ + model.pipeline_model_parallel_size=1 \ + 
model.pipeline_model_parallel_split_rank=0 \ + model.data.train_ds.task_name=rte \ + model.data.train_ds.global_batch_size=4 \ + model.data.train_ds.micro_batch_size=2 \ + model.data.validation_ds.global_batch_size=2 \ + model.data.validation_ds.micro_batch_size=2 \ + model.data.train_ds.file_path=/home/TestData/nlp/megatron_t5/data/train_ci.tsv \ + model.data.validation_ds.task_name=rte \ + model.data.validation_ds.file_path=/home/TestData/nlp/megatron_t5/data/dev_ci.tsv + AFTER_SCRIPT: | + rm -rf examples/nlp/language_modeling/t5_glue_results + + L2_Megatron_T5_GLUE_XNLI: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + python examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py \ + -cn megatron_t5_config_finetune_glue_xnli \ + trainer.devices=1 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=1 \ + +trainer.limit_val_batches=2 \ + +trainer.limit_test_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=2 \ + trainer.precision=16 \ + exp_manager.exp_dir=examples/nlp/language_modeling/t5_xnli_results \ + model.restore_from_path=/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m-refactor.nemo \ + model.pipeline_model_parallel_size=1 \ + model.pipeline_model_parallel_split_rank=0 \ + model.data.train_ds.global_batch_size=4 \ + model.data.train_ds.micro_batch_size=2 \ + model.data.validation_ds.global_batch_size=2 \ + model.data.validation_ds.micro_batch_size=2 \ + model.data.test_ds.global_batch_size=2 \ + model.data.test_ds.micro_batch_size=2 \ + model.data.train_ds.task_name=rte \ + model.data.train_ds.file_path=/home/TestData/nlp/megatron_t5/data/train_ci.tsv \ + model.data.validation_ds.task_name=xnli \ + model.data.validation_ds.file_path=/home/TestData/nlp/megatron_t5/data/xnli_dev_ci.tsv \ + model.data.test_ds.task_name=xnli \ + model.data.test_ds.file_path=/home/TestData/nlp/megatron_t5/data/xnli_dev_ci.tsv + AFTER_SCRIPT: | + rm -rf examples/nlp/language_modeling/t5_xnli_results + + L2_Megatron_T5_PEFT_Lora_TP2: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + rm -rf /home/TestData/nlp/t5_lora_tuning_tp2 + + python examples/nlp/language_modeling/tuning/megatron_t5_finetuning.py \ + trainer.devices=2 \ + trainer.log_every_n_steps=1 \ + trainer.max_epochs=9999 \ + trainer.max_steps=3 \ + trainer.val_check_interval=3 \ + ++trainer.limit_val_batches=2 \ + trainer.precision=16 \ + exp_manager.exp_dir=/home/TestData/nlp/t5_lora_tuning_tp2 \ + model.pipeline_model_parallel_size=1 \ + model.tensor_model_parallel_size=2 \ + model.restore_from_path=/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m_tp2.nemo \ + model.peft.peft_scheme=lora \ + model.answer_only_loss=True \ + model.micro_batch_size=1 \ + model.global_batch_size=1 \ + model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ + model.data.train_ds.concat_sampling_probabilities=[1.0] \ + model.data.train_ds.num_workers=0 \ + model.data.validation_ds.num_workers=0 \ + model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ + model.data.validation_ds.names=[quarel] + + python examples/nlp/language_modeling/tuning/megatron_t5_generate.py \ + model.restore_from_path=/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m_tp2.nemo \ + 
model.peft.restore_from_path=/home/TestData/nlp/t5_lora_tuning_tp2/megatron_t5_peft_lora_tuning/checkpoints/megatron_t5_peft_lora_tuning.nemo \ + model.peft.restore_from_ckpt_name=null \ + model.peft.restore_from_hparams_path=null \ + model.tensor_model_parallel_size=2 \ + trainer.devices=2 \ + model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel_4.jsonl] \ + model.data.test_ds.names=[quarel4] \ + model.global_batch_size=2 \ + model.micro_batch_size=1 \ + model.data.test_ds.tokens_to_generate=10 \ + model.data.test_ds.write_predictions_to_file=True \ + model.data.test_ds.output_file_path_prefix=/home/TestData/nlp/t5_lora_tuning_tp2/out \ + inference.greedy=True \ + inference.repetition_penalty=1.0 \ + inference.outfile_path=/home/TestData/nlp/t5_lora_tuning_tp2/out.jsonl + AFTER_SCRIPT: | + rm -rf /home/TestData/nlp/t5_lora_tuning_tp2 # L2: Megatron Mock Data Generation L2_Megatron_Mock_Data_Generation_MockGPTDataset: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.max_steps=10 \ - trainer.limit_val_batches=7 \ - trainer.val_check_interval=10 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - model.data.data_impl=mock \ - model.data.data_prefix=[] - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" - - L2_Megatron_Mock_Data_Generation_MockT5Dataset: - needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/megatron_t5_pretraining.py \ + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ trainer.max_steps=10 \ - trainer.limit_val_batches=3 \ + trainer.limit_val_batches=7 \ trainer.val_check_interval=10 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + model.mcore_gpt=True \ model.data.data_impl=mock \ model.data.data_prefix=[] - rm -rf examples/nlp/language_modeling/t5_pretrain_results - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + L2_Megatron_Mock_Data_Generation_MockT5Dataset: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_t5_pretraining.py \ + trainer.max_steps=10 \ + trainer.limit_val_batches=3 \ + trainer.val_check_interval=10 \ + exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ + model.data.data_impl=mock \ + model.data.data_prefix=[] + AFTER_SCRIPT: | + rm -rf examples/nlp/language_modeling/t5_pretrain_results # L2: TTS Fast dev runs 1 L2_TTS_Fast_dev_runs_1_Tacotron_2: needs: 
[cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/tts/tacotron2.py \ - train_dataset=/home/TestData/an4_dataset/an4_train.json \ - validation_datasets=/home/TestData/an4_dataset/an4_val.json \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.limit_train_batches=1 +trainer.limit_val_batches=1 trainer.max_epochs=1 \ - trainer.strategy=auto \ - model.decoder.decoder_rnn_dim=256 \ - model.decoder.attention_rnn_dim=1024 \ - model.decoder.prenet_dim=128 \ - model.postnet.postnet_n_convolutions=3 \ - model.train_ds.dataloader_params.batch_size=4 \ - model.train_ds.dataloader_params.num_workers=0 \ - model.validation_ds.dataloader_params.batch_size=4 \ - model.validation_ds.dataloader_params.num_workers=0 \ - ~model.text_normalizer \ - ~model.text_normalizer_call_kwargs \ - ~trainer.check_val_every_n_epoch - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + python examples/tts/tacotron2.py \ + train_dataset=/home/TestData/an4_dataset/an4_train.json \ + validation_datasets=/home/TestData/an4_dataset/an4_val.json \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + +trainer.limit_train_batches=1 +trainer.limit_val_batches=1 trainer.max_epochs=1 \ + trainer.strategy=auto \ + model.decoder.decoder_rnn_dim=256 \ + model.decoder.attention_rnn_dim=1024 \ + model.decoder.prenet_dim=128 \ + model.postnet.postnet_n_convolutions=3 \ + model.train_ds.dataloader_params.batch_size=4 \ + model.train_ds.dataloader_params.num_workers=0 \ + model.validation_ds.dataloader_params.batch_size=4 \ + model.validation_ds.dataloader_params.num_workers=0 \ + ~model.text_normalizer \ + ~model.text_normalizer_call_kwargs \ + ~trainer.check_val_every_n_epoch L2_TTS_Fast_dev_runs_1_WaveGlow: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/tts/waveglow.py \ - train_dataset=/home/TestData/an4_dataset/an4_train.json \ - validation_datasets=/home/TestData/an4_dataset/an4_val.json \ - trainer.devices="[0]" \ - +trainer.limit_train_batches=1 +trainer.limit_val_batches=1 trainer.max_epochs=1 \ - trainer.strategy=auto \ - model.train_ds.dataloader_params.batch_size=4 \ - model.train_ds.dataloader_params.num_workers=0 \ - model.validation_ds.dataloader_params.batch_size=4 \ - model.validation_ds.dataloader_params.num_workers=0 \ - model.waveglow.n_flows=4 \ - model.waveglow.n_wn_layers=2 \ - model.waveglow.n_wn_channels=32 \ - ~trainer.check_val_every_n_epoch - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/tts/waveglow.py \ + train_dataset=/home/TestData/an4_dataset/an4_train.json \ + 
validation_datasets=/home/TestData/an4_dataset/an4_val.json \ + trainer.devices="[0]" \ + +trainer.limit_train_batches=1 +trainer.limit_val_batches=1 trainer.max_epochs=1 \ + trainer.strategy=auto \ + model.train_ds.dataloader_params.batch_size=4 \ + model.train_ds.dataloader_params.num_workers=0 \ + model.validation_ds.dataloader_params.batch_size=4 \ + model.validation_ds.dataloader_params.num_workers=0 \ + model.waveglow.n_flows=4 \ + model.waveglow.n_wn_layers=2 \ + model.waveglow.n_wn_channels=32 \ + ~trainer.check_val_every_n_epoch L2_TTS_Fast_dev_runs_1_FastPitch: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/tts/fastpitch.py \ - --config-name fastpitch_align_v1.05 \ - train_dataset=/home/TestData/an4_dataset/an4_train.json \ - validation_datasets=/home/TestData/an4_dataset/an4_val.json \ - sup_data_path=/home/TestData/an4_dataset/beta_priors \ - trainer.devices="[0]" \ - +trainer.limit_train_batches=1 \ - +trainer.limit_val_batches=1 \ - trainer.max_epochs=1 \ - trainer.strategy=auto \ - model.pitch_mean=212.35873413085938 \ - model.pitch_std=68.52806091308594 \ - model.train_ds.dataloader_params.batch_size=4 \ - model.train_ds.dataloader_params.num_workers=0 \ - model.validation_ds.dataloader_params.batch_size=4 \ - model.validation_ds.dataloader_params.num_workers=0 \ - model.symbols_embedding_dim=64 \ - model.input_fft.d_inner=384 \ - model.input_fft.n_layer=2 \ - model.output_fft.d_inner=384 \ - model.output_fft.n_layer=2 \ - ~trainer.check_val_every_n_epoch \ - ~model.text_normalizer \ - ~model.text_normalizer_call_kwargs - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" - - L2_TTS_Fast_dev_runs_1_RADTTS: - needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/tts/radtts.py \ - train_dataset=/home/TestData/an4_dataset/an4_train.json \ - validation_datasets=/home/TestData/an4_dataset/an4_val.json \ - sup_data_path=/home/TestData/an4_dataset/radtts_beta_priors \ - trainer.devices="[0]" \ - +trainer.limit_train_batches=1 \ - +trainer.limit_val_batches=1 \ - trainer.max_epochs=1 \ - trainer.strategy=auto \ - model.pitch_mean=212.35873413085938 \ - model.pitch_std=68.52806091308594 \ - model.train_ds.dataloader_params.batch_size=4 \ - model.train_ds.dataloader_params.num_workers=0 \ - model.validation_ds.dataloader_params.batch_size=4 \ - model.validation_ds.dataloader_params.num_workers=0 \ - export_dir=/home/TestData/radtts_test \ - model.optim.lr=0.0001 \ - model.modelConfig.decoder_use_partial_padding=True \ - ~trainer.check_val_every_n_epoch \ - ~model.text_normalizer \ - ~model.text_normalizer_call_kwargs - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/tts/fastpitch.py \ 
+ --config-name fastpitch_align_v1.05 \ + train_dataset=/home/TestData/an4_dataset/an4_train.json \ + validation_datasets=/home/TestData/an4_dataset/an4_val.json \ + sup_data_path=/home/TestData/an4_dataset/beta_priors \ + trainer.devices="[0]" \ + +trainer.limit_train_batches=1 \ + +trainer.limit_val_batches=1 \ + trainer.max_epochs=1 \ + trainer.strategy=auto \ + model.pitch_mean=212.35873413085938 \ + model.pitch_std=68.52806091308594 \ + model.train_ds.dataloader_params.batch_size=4 \ + model.train_ds.dataloader_params.num_workers=0 \ + model.validation_ds.dataloader_params.batch_size=4 \ + model.validation_ds.dataloader_params.num_workers=0 \ + model.symbols_embedding_dim=64 \ + model.input_fft.d_inner=384 \ + model.input_fft.n_layer=2 \ + model.output_fft.d_inner=384 \ + model.output_fft.n_layer=2 \ + ~trainer.check_val_every_n_epoch \ + ~model.text_normalizer \ + ~model.text_normalizer_call_kwargs + + # OPTIONAL_L2_TTS_Fast_dev_runs_1_RADTTS: + # needs: [cicd-test-container-setup] + # runs-on: self-hosted-azure + # timeout-minutes: 10 + # container: + # image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} + # options: + # # --user 0:128 + # --device=/dev/nvidia0 + # --gpus all + # --shm-size=8g + # --env TRANSFORMERS_OFFLINE=0 + # --env HYDRA_FULL_ERROR=1 + # --volume /mnt/datadrive/TestData:/home/TestData + # steps: + # - name: Checkout repository + # uses: actions/checkout@v4 + # - run: | + # python examples/tts/radtts.py \ + # train_dataset=/home/TestData/an4_dataset/an4_train.json \ + # validation_datasets=/home/TestData/an4_dataset/an4_val.json \ + # sup_data_path=/home/TestData/an4_dataset/radtts_beta_priors \ + # trainer.devices="[0]" \ + # +trainer.limit_train_batches=1 \ + # +trainer.limit_val_batches=1 \ + # trainer.max_epochs=1 \ + # trainer.strategy=auto \ + # model.pitch_mean=212.35873413085938 \ + # model.pitch_std=68.52806091308594 \ + # model.train_ds.dataloader_params.batch_size=4 \ + # model.train_ds.dataloader_params.num_workers=0 \ + # model.validation_ds.dataloader_params.batch_size=4 \ + # model.validation_ds.dataloader_params.num_workers=0 \ + # export_dir=/home/TestData/radtts_test \ + # model.optim.lr=0.0001 \ + # model.modelConfig.decoder_use_partial_padding=True \ + # ~trainer.check_val_every_n_epoch \ + # ~model.text_normalizer \ + # ~model.text_normalizer_call_kwargs + # #- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" + # # if: "failure()" L2_TTS_Fast_dev_runs_1_Mixer-TTS: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/tts/mixer_tts.py \ - train_dataset=/home/TestData/an4_dataset/an4_train.json \ - validation_datasets=/home/TestData/an4_dataset/an4_val.json \ - sup_data_path=/home/TestData/an4_dataset/sup_data \ - trainer.devices="[0]" \ - +trainer.limit_train_batches=1 \ - +trainer.limit_val_batches=1 \ - trainer.max_epochs=1 \ - trainer.strategy=auto \ - model.pitch_mean=212.35873413085938 \ - model.pitch_std=68.52806091308594 \ - model.train_ds.dataloader_params.batch_size=4 \ - model.train_ds.dataloader_params.num_workers=0 \ - model.validation_ds.dataloader_params.batch_size=4 \ - model.validation_ds.dataloader_params.num_workers=0 \ - 
~trainer.check_val_every_n_epoch \ - ~model.text_normalizer \ - ~model.text_normalizer_call_kwargs - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/tts/mixer_tts.py \ + train_dataset=/home/TestData/an4_dataset/an4_train.json \ + validation_datasets=/home/TestData/an4_dataset/an4_val.json \ + sup_data_path=/home/TestData/an4_dataset/sup_data \ + trainer.devices="[0]" \ + +trainer.limit_train_batches=1 \ + +trainer.limit_val_batches=1 \ + trainer.max_epochs=1 \ + trainer.strategy=auto \ + model.pitch_mean=212.35873413085938 \ + model.pitch_std=68.52806091308594 \ + model.train_ds.dataloader_params.batch_size=4 \ + model.train_ds.dataloader_params.num_workers=0 \ + model.validation_ds.dataloader_params.batch_size=4 \ + model.validation_ds.dataloader_params.num_workers=0 \ + ~trainer.check_val_every_n_epoch \ + ~model.text_normalizer \ + ~model.text_normalizer_call_kwargs L2_TTS_Fast_dev_runs_1_Hifigan: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/tts/hifigan.py \ - train_dataset=/home/TestData/an4_dataset/an4_train.json \ - validation_datasets=/home/TestData/an4_dataset/an4_val.json \ - trainer.devices="[0]" \ - +trainer.limit_train_batches=1 \ - +trainer.limit_val_batches=1 \ - +trainer.max_epochs=1 \ - trainer.strategy=auto \ - model.train_ds.dataloader_params.batch_size=4 \ - model.train_ds.dataloader_params.num_workers=0 \ - model.validation_ds.dataloader_params.batch_size=4 \ - model.validation_ds.dataloader_params.num_workers=0 \ - model.generator.upsample_initial_channel=64 \ - +model.debug=true \ - ~trainer.check_val_every_n_epoch - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/tts/hifigan.py \ + train_dataset=/home/TestData/an4_dataset/an4_train.json \ + validation_datasets=/home/TestData/an4_dataset/an4_val.json \ + trainer.devices="[0]" \ + +trainer.limit_train_batches=1 \ + +trainer.limit_val_batches=1 \ + +trainer.max_epochs=1 \ + trainer.strategy=auto \ + model.train_ds.dataloader_params.batch_size=4 \ + model.train_ds.dataloader_params.num_workers=0 \ + model.validation_ds.dataloader_params.batch_size=4 \ + model.validation_ds.dataloader_params.num_workers=0 \ + model.generator.upsample_initial_channel=64 \ + +model.debug=true \ + ~trainer.check_val_every_n_epoch # L2: NeRF # L2_NeRF_DreamFusion: @@ -6145,44 +4895,32 @@ jobs: Speech_Checkpoints_tests: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - CUDA_VISIBLE_DEVICES=0 python examples/asr/speech_to_text_eval.py \ - pretrained_name=QuartzNet15x5Base-En \ - 
dataset_manifest=/home/TestData/librispeech/librivox-dev-other.json \
-          batch_size=64 \
-          tolerance=0.1012
-          rm -f examples/asr/evaluation_transcripts.json
-      - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
-        if: "failure()"
-
+    uses: ./.github/workflows/_test_template.yml
+    with:
+      RUNNER: self-hosted-azure
+      TIMEOUT: 20
+      SCRIPT: |
+        CUDA_VISIBLE_DEVICES=0 python examples/asr/speech_to_text_eval.py \
+            pretrained_name=QuartzNet15x5Base-En \
+            dataset_manifest=/home/TestData/librispeech/librivox-dev-other.json \
+            batch_size=64 \
+            tolerance=0.1012
+      AFTER_SCRIPT: |
+        rm -f examples/asr/evaluation_transcripts.json

  Nemo_CICD_Test:
-    needs:
-      - L0_Unit_Tests_GPU
+    needs:
+      #- OPTIONAL_L0_Unit_Tests_GPU
      - L0_Unit_Tests_CPU
      - L2_Community_LLM_Checkpoints_tests_Llama
      - L2_Community_LLM_Checkpoints_tests_StarCoder
      - L2_Community_LLM_Checkpoints_tests_Falcon
-      - L2_Community_LLM_Checkpoints_tests_Baichuan2
+      #- OPTIONAL_L2_Community_LLM_Checkpoints_tests_Baichuan2
      - ASR_dev_run_Speech_to_Text
      - ASR_dev_run_Speech_to_Text_WPE_-_CitriNet
      - ASR_dev_run_Speech_Pre-training_-_CitriNet
      - ASR_dev_run_Speech_To_Text_Finetuning
-      - ASR_dev_run_Speech_To_Text_HF_Finetuning
+      #- OPTIONAL_ASR_dev_run_Speech_To_Text_HF_Finetuning
      - ASR_dev_run_Speech_to_Text_WPE_-_Conformer
      - ASR_dev_run-part_two_Speech_to_Text_WPE_-_Squeezeformer
      - L2_Speech_to_Text_EMA
@@ -6248,6 +4986,8 @@ jobs:
      - L2_Megatron_Core_Bert_Pretraining_and_Resume_Training
      - L2_Legacy_Megatron_RETRO_Pretraining_and_Resume_Training
      - L2_Megatron_RETRO_Pretraining_and_Resume_Training
+      - L2_RAG_Pipeline_Indexing
+      - L2_RAG_Pipeline_Generating
      - L2_BioMegatron_Bert_NER_Task
      - L2_Megatron_GPT_Pretraining_and_Resume_Training_TP2
      - L2_Megatron_GPT_with_Rope_Pretraining_and_Resume_Training_TP2
@@ -6256,6 +4996,7 @@ jobs:
      - L2_Megatron_GPT_Pretraining_and_Resume_Training_PP2
      - L2_Megatron_GPT_Finetuning_PP2
      - L2_Megatron_GPT_Finetuning_StarCoder_PP1
+      - L2_Megatron_GPT_Embedding
      - L2_Megatron_GPT_PEFT_Lora_PP2
      - L2_Megatron_GPT_PEFT_Lora_TP2
      - L2_Megatron_GPT_Eval
@@ -6280,13 +5021,41 @@ jobs:
      - L2_TTS_Fast_dev_runs_1_Tacotron_2
      - L2_TTS_Fast_dev_runs_1_WaveGlow
      - L2_TTS_Fast_dev_runs_1_FastPitch
-      - L2_TTS_Fast_dev_runs_1_RADTTS
+      #- OPTIONAL_L2_TTS_Fast_dev_runs_1_RADTTS
      - L2_TTS_Fast_dev_runs_1_Mixer-TTS
      - L2_TTS_Fast_dev_runs_1_Hifigan
      - Speech_Checkpoints_tests
-
+    if: always()
    runs-on: ubuntu-latest
-    steps:
-      # This should depend on all the tests so we block/unblock based on all tests passing
-      - run: exit 0
+    steps:
+      - if: ${{ always() }}
+        id: pipeline-conclusion
+        run: |
+          # Slack notifications are sent only on test failure (not cancelled):
+          FAILED=${{ contains(needs.*.outputs.conclusion, 'failure') }}
+          echo "FAILED=$FAILED" >> $GITHUB_OUTPUT
+
+          # Mark as successful if no job was cancelled:
+          SUCCESS=${{ !contains(needs.*.result, 'cancelled') }}
+          echo "SUCCESS=$SUCCESS" >> $GITHUB_OUTPUT
+
+      # This should depend on all the tests so we block/unblock based on all tests passing
+      - if: ${{ always() && steps.pipeline-conclusion.outputs.SUCCESS == 'true' }}
+        run: exit 0
+
+      - if: ${{ always() && steps.pipeline-conclusion.outputs.FAILED == 'true' }}
+        name: Checkout repository
+        uses: actions/checkout@v4
+
+      - if: ${{ always() && steps.pipeline-conclusion.outputs.FAILED == 'true' }}
+        run: |
+          source .github/scripts/slackHelper.sh
+
+          WEBHOOK_URL=${{ secrets.SLACK_WEBHOOK }}
+          PIPELINE_URL=${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}

+          sendSlackMessage "$WEBHOOK_URL" "$PIPELINE_URL"
+      - if:
${{ always() && steps.pipeline-conclusion.outputs.SUCCESS == 'false' }} + run: | + exit 1 diff --git a/.github/workflows/code-formatting.yml b/.github/workflows/code-formatting.yml new file mode 100644 index 000000000000..a4b8cf3d4072 --- /dev/null +++ b/.github/workflows/code-formatting.yml @@ -0,0 +1,66 @@ +name: Isort and Black Formatting +# Incrementally reformat only changed files with black, all files with isort +# +# Replaces pre-commit.ci, since it reformats all the files. +# See issue https://github.com/pre-commit-ci/issues/issues/90 +# +# The action requires a custom token to trigger workflow after pushing reformatted files back to the branch. +# `secrets.GITHUB_TOKEN` can be used instead, but this will result +# in not running necessary checks after reformatting, which is undesirable. +# For details see https://github.com/orgs/community/discussions/25702 + +on: + pull_request_target: + paths: + - '**.py' + +jobs: + reformat_with_isort_and_black: + runs-on: ubuntu-latest + permissions: + # write permissions required to commit changes + contents: write + steps: + - name: Checkout branch + uses: actions/checkout@v4 + with: + # setup repository and ref for PRs, see + # https://github.com/EndBug/add-and-commit?tab=readme-ov-file#working-with-prs + repository: ${{ github.event.pull_request.head.repo.full_name }} + ref: ${{ github.event.pull_request.head.ref }} + # custom token is required to trigger actions after reformatting + pushing + token: ${{ secrets.NEMO_REFORMAT_TOKEN }} + + # https://github.com/tj-actions/changed-files + - name: Get changed files + id: changed-files + uses: tj-actions/changed-files@v44 + with: + files: | + **.py + + - name: Setup Python env + uses: actions/setup-python@v5 + with: + python-version: "3.10" + + - name: black + uses: psf/black@stable + with: + options: "--verbose" + # apply only to changed files (pass explicitly the files) + src: "${{ steps.changed-files.outputs.all_changed_files }}" + version: "~= 24.3" + + - name: isort + uses: isort/isort-action@v1 + with: + isort-version: "5.13.2" + # reformat all files with isort – safe since the whole repo is already reformatted + configuration: "" + + - uses: EndBug/add-and-commit@v9 + # Commit changes. Nothing is committed if no changes. 
+ with: + message: Apply isort and black reformatting + commit: --signoff diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 75d1a6c51a1e..3f2213062872 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -19,6 +19,8 @@ ci: autofix_prs: true autoupdate_commit_msg: '[pre-commit.ci] pre-commit suggestions' autoupdate_schedule: quarterly + # skip all hooks that can change the files, use GitHub Action "code-formatting.yml" for this + skip: [black,isort] repos: - repo: https://github.com/pre-commit/pre-commit-hooks @@ -32,15 +34,19 @@ repos: - id: requirements-txt-fixer - repo: https://github.com/PyCQA/isort - rev: 5.12.0 + rev: 5.13.2 hooks: - id: isort name: Format imports exclude: docs/ - - repo: https://github.com/psf/black - rev: 19.10b0 + # Using this mirror lets us use mypyc-compiled black, which is about 2x faster + - repo: https://github.com/psf/black-pre-commit-mirror + rev: 24.3.0 hooks: - id: black - name: Format code - additional_dependencies: ['click==8.0.2'] + # It is recommended to specify the latest version of Python + # supported by your project here, or alternatively use + # pre-commit's default_language_version, see + # https://pre-commit.com/#top_level-default_language_version + language_version: python3.10 diff --git a/Dockerfile.ci b/Dockerfile.ci new file mode 100644 index 000000000000..18188f7be45f --- /dev/null +++ b/Dockerfile.ci @@ -0,0 +1,75 @@ +# syntax=docker/dockerfile:1-labs + +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:24.02-py3 + +FROM ${BASE_IMAGE} + +ENV TRANSFORMERS_OFFLINE=0 +ENV HYDRA_FULL_ERROR=1 +ENV PYTHONUNBUFFERED=1 + +# APT packages +RUN <<"EOF" bash -ex +apt-get update +apt-get install -y bc libsox-fmt-all -y +apt-get clean +EOF + +WORKDIR /workspace + +# Install NeMo requirements +ARG TE_TAG=bfe21c3d68b0a9951e5716fb520045db53419c5e +ARG MODELOPT_VERSION=0.11.0 +ARG MCORE_TAG=c90aa1671fc0b97f80fa6c3bb892ce6f8e88e7c9 +ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c +RUN \ +--mount=type=bind,source=requirements,target=requirements \ +--mount=type=bind,source=tools,target=tools \ +--mount=type=bind,source=setup.py,target=setup.py \ +--mount=type=bind,source=nemo/package_info.py,target=nemo/package_info.py \ +--mount=type=bind,source=nemo/__init__.py,target=nemo/__init__.py <<"EOF" bash -ex +pip install --no-cache-dir --no-build-isolation --extra-index-url https://pypi.nvidia.com \ +"transformer-engine @ git+https://github.com/NVIDIA/TransformerEngine.git@${TE_TAG}" \ +"megatron_core @ git+https://github.com/NVIDIA/Megatron-LM.git@${MCORE_TAG}" \ +"nvidia-modelopt[torch]~=${MODELOPT_VERSION}" \ +"apex @ git+https://github.com/NVIDIA/apex.git@${APEX_TAG}" \ +"llama-index==0.10.43" \ +-r tools/ctc_segmentation/requirements.txt \ +".[all]" + +# Megatron Core installation +git clone https://github.com/NVIDIA/Megatron-LM.git && \ +pushd Megatron-LM && \ +git checkout ${MCORE_TAG} && \ + pushd megatron/core/datasets && \ + make && \ + popd && \ +popd +export PYTHONPATH="${PYTHONPATH}:/workspace/Megatron-LM" +EOF + +# Copy over NeMo code +COPY ./ ./ +RUN <<"EOF" bash -ex +pip install --no-cache-dir --no-build-isolation ".[all]" + +# set permission +chmod 777 -R /workspace +EOF + +ENV PYTHONPATH="${PYTHONPATH}:/workspace/Megatron-LM" + diff --git a/Jenkinsfile b/Jenkinsfile deleted file mode 100644 index cbc52d20c41c..000000000000 --- a/Jenkinsfile +++ /dev/null @@ -1,5912 +0,0 @@ -pipeline { - agent { - docker { - image 'nvcr.io/nvidia/pytorch:24.02-py3' - args '--device=/dev/nvidia0 --gpus all --user 0:128 -v /home/TestData:/home/TestData -v $HOME/.cache:/root/.cache --shm-size=8g --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1' - } - } - - environment { - NVTE_FUSED_ATTN = 0 - NVTE_FLASH_ATTN = 0 - PYTHONPATH = "/mnt/D3/JenkinsWorkDir/workspace/NeMo-multibranch_${GIT_BRANCH}/Megatron-LM" - } - - options { - timeout(time: 8, unit: 'HOURS') - disableConcurrentBuilds(abortPrevious: true) - } - - stages { - - stage('Add git safe directory'){ - steps{ - sh 'git config --global --add safe.directory /var/lib/jenkins/workspace/NeMo_$GIT_BRANCH' - sh 'git config --global --add safe.directory /raid/JenkinsWorkDir/workspace/NeMo_$GIT_BRANCH' - sh 'git config --global --add safe.directory /mnt/D3/JenkinsWorkDir/workspace/NeMo_$GIT_BRANCH' - } - } - - stage('nvidia-smi'){ - steps{ - sh 'nvidia-smi' - } - } - - stage('PyTorch version') { - steps { - sh 'python -c "import torch; print(torch.__version__)"' - sh 'python -c "import torchvision; print(torchvision.__version__)"' - } - } - - stage('Install test requirements') { - steps { - sh 'apt-get update && apt-get install -y bc && pip install -r requirements/requirements_test.txt && pip install -r requirements/requirements_lightning.txt' - } - } - - stage('Code formatting checks') { - steps { - sh 'python setup.py style' - } - } - - stage('Copyright Headers check') { - steps { - sh 'python tests/check_copyright_header.py --dir .' 
- } - } - - stage('NeMo Installation') { - steps { - sh './reinstall.sh release' - } - } - - stage('Transformer Engine installation') { - steps { - sh 'git clone https://github.com/NVIDIA/TransformerEngine.git && \ - cd TransformerEngine && \ - git fetch origin bfe21c3d68b0a9951e5716fb520045db53419c5e && \ - git checkout FETCH_HEAD && \ - git submodule init && git submodule update && \ - NVTE_FRAMEWORK=pytorch pip install .' - } - } - - stage('Apex installation') { - steps { - sh 'git clone https://github.com/NVIDIA/apex.git && \ - cd apex && \ - git checkout 810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c && \ - cp -R apex /usr/local/lib/python3.10/dist-packages' - } - } - - stage('Megatron Core installation') { - steps { - sh 'git clone https://github.com/NVIDIA/Megatron-LM.git && \ - cd Megatron-LM && \ - git checkout fbb375d4b5e88ce52f5f7125053068caff47f93f && \ - pip install . && \ - cd megatron/core/datasets && \ - make' - } - } - - stage('AMMO installation') { - steps { - sh 'pip install nvidia-ammo~=0.9.0 --extra-index-url https://pypi.nvidia.com --no-cache-dir' - } - } - - stage('PyTorch Lightning version') { - steps { - sh 'python -c "import pytorch_lightning; print(pytorch_lightning.__version__)"' - } - } - - stage('PyTorch Lightning DDP Checks') { - steps { - sh 'CUDA_VISIBLE_DEVICES="0,1" python "tests/core_ptl/check_for_ranks.py"' - } - } - - stage('Basic Import Checks') { - steps { - sh 'python -c "import nemo.collections.asr as nemo_asr"' - sh 'python -c "import nemo.collections.nlp as nemo_nlp"' - sh 'python -c "import nemo.collections.tts as nemo_tts"' - } - } - stage('Import Checks'){ - steps { - sh 'python tests/core_ptl/check_imports.py --domain "nlp"' - } - } - - stage('L0: Unit Tests GPU') { - steps { - sh 'NEMO_NUMBA_MINVER=0.53 pytest -m "not pleasefixme" --with_downloads' - } - } - - stage('L0: Unit Tests CPU') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - steps { - sh 'CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat' - } - } - - stage('L2: Multimodal Imagen Train') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "rm -rf /home/TestData/multimodal/imagen_train" - sh "python examples/multimodal/text_to_image/imagen/imagen_training.py \ - trainer.precision=16 \ - trainer.num_nodes=1 \ - trainer.devices=1 \ - ++exp_manager.max_time_per_run=00:00:03:00 \ - trainer.max_steps=20 \ - model.conditioning.embed_dim=64 \ - model.micro_batch_size=1 \ - model.global_batch_size=1 \ - model.data.synthetic_data=True \ - exp_manager.exp_dir=/home/TestData/multimodal/imagen_train \ - model.inductor=False \ - model.unet.flash_attention=False \ - " - sh "rm -rf /home/TestData/multimodal/imagen_train" - } - } - - stage('L2: Multimodal Stable Diffusion Train') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "rm -rf /home/TestData/multimodal/stable_diffusion_train" - sh "python examples/multimodal/text_to_image/stable_diffusion/sd_train.py \ - trainer.precision=bf16 \ - trainer.num_nodes=1 \ - trainer.devices=1 \ - ++exp_manager.max_time_per_run=00:00:03:00 \ - trainer.max_steps=20 \ - model.micro_batch_size=1 \ - model.global_batch_size=1 \ - model.optim.name=megatron_fused_adam \ - model.data.synthetic_data=True \ - exp_manager.exp_dir=/home/TestData/multimodal/stable_diffusion_train \ - model.inductor=False \ - 
model.cond_stage_config._target_=nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.FrozenCLIPEmbedder \ - ++model.cond_stage_config.version=openai/clip-vit-large-patch14 \ - ++model.cond_stage_config.max_length=77 \ - ~model.cond_stage_config.restore_from_path \ - ~model.cond_stage_config.freeze \ - ~model.cond_stage_config.layer \ - model.unet_config.from_pretrained=null \ - model.first_stage_config.from_pretrained=null \ - model.unet_config.use_flash_attention=False \ - model.unet_config.attention_resolutions=[1] \ - model.unet_config.channel_mult=[1] \ - model.ddp_overlap=False \ - " - sh "rm -rf /home/TestData/multimodal/stable_diffusion_train" - } - } - stage('L2: Multimodal Stable Diffusion Train with Cuda Graph') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "rm -rf /home/TestData/multimodal/stable_diffusion_train_with_cuda_graphs" - sh "python examples/multimodal/text_to_image/stable_diffusion/sd_train.py \ - trainer.precision=bf16 \ - trainer.num_nodes=1 \ - trainer.devices=1 \ - ++exp_manager.max_time_per_run=00:00:03:00 \ - exp_manager.exp_dir=/home/TestData/multimodal/stable_diffusion_train_with_cuda_graph \ - trainer.max_steps=20 \ - model.micro_batch_size=1 \ - model.global_batch_size=1 \ - model.data.synthetic_data=True \ - model.first_stage_key=images_moments \ - model.cond_stage_key=clip_encoded \ - model.optim.name=megatron_fused_adam \ - +model.optim.capturable=True \ - exp_manager.ema.enable=False \ - model.cond_stage_config._target_=nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.FrozenCLIPEmbedder \ - ++model.cond_stage_config.version=openai/clip-vit-large-patch14 \ - ++model.cond_stage_config.max_length=77 \ - model.inductor=False \ - ~model.cond_stage_config.restore_from_path \ - ~model.cond_stage_config.freeze \ - ~model.cond_stage_config.layer \ - model.first_stage_config.from_pretrained=null \ - model.ddp_overlap=False \ - model.capture_cudagraph_iters=15 \ - model.unet_config.use_flash_attention=False \ - model.unet_config.attention_resolutions=[1] \ - model.unet_config.channel_mult=[1] \ - " - sh "rm -rf /home/TestData/multimodal/stable_diffusion_train_with_cuda_graphs" - } - } -// stage('L2: Multimodal ControlNet Train') { -// when { -// anyOf { -// branch 'main' -// changeRequest target: 'main' -// } -// } -// failFast true -// steps { -// sh "rm -rf /home/TestData/multimodal/controlnet_train" -// sh "python examples/multimodal/text_to_image/controlnet/controlnet_train.py \ -// trainer.precision=16 \ -// trainer.num_nodes=1 \ -// trainer.devices=1 \ -// ++exp_manager.max_time_per_run=00:00:03:00 \ -// trainer.max_steps=20 \ -// model.micro_batch_size=1 \ -// model.global_batch_size=1 \ -// model.data.synthetic_data=True \ -// exp_manager.exp_dir=/home/TestData/multimodal/controlnet_train \ -// model.inductor=False \ -// model.image_logger.max_images=0 \ -// model.control_stage_config.params.from_pretrained_unet=null \ -// model.unet_config.from_pretrained=null \ -// model.first_stage_config.from_pretrained=null \ -// model.unet_config.use_flash_attention=False \ -// " -// sh "rm -rf /home/TestData/multimodal/controlnet_train" -// } -// } -// stage('L2: Multimodal DreamBooth Train') { -// when { -// anyOf { -// branch 'main' -// changeRequest target: 'main' -// } -// } -// failFast true -// steps { -// sh "rm -rf /home/TestData/multimodal/dreambooth_train" -// sh "python examples/multimodal/text_to_image/dreambooth/dreambooth.py \ -// trainer.precision=16 \ 
-// trainer.num_nodes=1 \ -// trainer.devices=1 \ -// ++exp_manager.max_time_per_run=00:00:03:00 \ -// trainer.max_steps=20 \ -// model.micro_batch_size=1 \ -// model.global_batch_size=1 \ -// exp_manager.exp_dir=/home/TestData/multimodal/dreambooth_train \ -// model.inductor=False \ -// model.cond_stage_config._target_=nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.FrozenCLIPEmbedder \ -// ++model.cond_stage_config.version=openai/clip-vit-large-patch14 \ -// ++model.cond_stage_config.max_length=77 \ -// ~model.cond_stage_config.restore_from_path \ -// ~model.cond_stage_config.freeze \ -// ~model.cond_stage_config.layer \ -// model.unet_config.from_pretrained=null \ -// model.first_stage_config.from_pretrained=null \ -// model.data.instance_dir=/home/TestData/multimodal/tiny-dreambooth \ -// model.unet_config.use_flash_attention=False \ -// " -// sh "rm -rf /home/TestData/multimodal/dreambooth_train" -// } -// } - stage('L2: Vision ViT Pretrain TP=1') { - when { - anyOf { - branch 'r1.23.0' - changeRequest target: 'r1.23.0' - } - } - failFast true - steps { - sh "rm -rf /home/TestData/vision/vit_pretrain_tp1" - sh "python examples/vision/vision_transformer/megatron_vit_classification_pretrain.py \ - trainer.precision=16 \ - model.megatron_amp_O2=False \ - trainer.num_nodes=1 \ - trainer.devices=1 \ - trainer.val_check_interval=5 \ - ++exp_manager.max_time_per_run=00:00:03:00 \ - trainer.max_steps=20 \ - model.micro_batch_size=2 \ - model.global_batch_size=4 \ - model.tensor_model_parallel_size=1 \ - model.pipeline_model_parallel_size=1 \ - model.data.num_workers=0 \ - exp_manager.create_checkpoint_callback=False \ - model.data.data_path=[/home/TestData/multimodal/tiny-imagenet/train,/home/TestData/multimodal/tiny-imagenet/val] \ - exp_manager.exp_dir=/home/TestData/vision/vit_pretrain_tp1 " - sh "rm -rf /home/TestData/vision/vit_pretrain_tp1" - } - } - - stage('L2: Multimodal CLIP Pretrain TP=1') { - when { - anyOf { - branch 'r1.23.0' - changeRequest target: 'r1.23.0' - } - } - failFast true - steps { - sh "rm -rf /home/TestData/multimodal/clip_pretrain_tp1" - sh "python examples/multimodal/vision_language_foundation/clip/megatron_clip_pretrain.py \ - trainer.precision=16 \ - model.megatron_amp_O2=False \ - trainer.num_nodes=1 \ - trainer.devices=1 \ - trainer.val_check_interval=10 \ - ++exp_manager.max_time_per_run=00:00:03:00 \ - trainer.max_steps=20 \ - model.micro_batch_size=1 \ - model.global_batch_size=1 \ - model.tensor_model_parallel_size=1 \ - model.pipeline_model_parallel_size=1 \ - exp_manager.create_checkpoint_callback=False \ - model.data.num_workers=0 \ - model.vision.num_layers=2 \ - model.text.num_layers=2 \ - model.vision.patch_dim=32 \ - model.vision.encoder_seq_length=49 \ - model.vision.class_token_length=7 \ - model.data.train.dataset_path=[/home/TestData/multimodal/tiny-clip/00000.tar] \ - model.data.validation.dataset_path=[/home/TestData/multimodal/tiny-clip/00000.tar] \ - model.data.webdataset.local_root_path=/ \ - exp_manager.exp_dir=/home/TestData/multimodal/clip_pretrain_tp1 " - sh "rm -rf /home/TestData/multimodal/clip_pretrain_tp1" - } - } - - stage('L2: Multimodal NeVA Pretrain TP=1') { - when { - anyOf { - branch 'r1.23.0' - changeRequest target: 'r1.23.0' - } - } - failFast true - steps { - sh "rm -rf /home/TestData/multimodal/neva_pretrain_tp1" - sh "python examples/multimodal/multimodal_llm/neva/neva_pretrain.py \ - trainer.precision=16 \ - model.megatron_amp_O2=False \ - trainer.num_nodes=1 \ - trainer.devices=1 \ - 
trainer.val_check_interval=10 \ - trainer.limit_val_batches=5 \ - trainer.log_every_n_steps=1 \ - ++exp_manager.max_time_per_run=00:00:03:00 \ - trainer.max_steps=20 \ - model.micro_batch_size=2 \ - model.global_batch_size=4 \ - model.tensor_model_parallel_size=1 \ - model.pipeline_model_parallel_size=1 \ - exp_manager.create_checkpoint_callback=False \ - model.data.data_path=/home/TestData/multimodal/tiny-neva/dummy.json \ - model.data.image_folder=/home/TestData/multimodal/tiny-neva/images \ - model.tokenizer.library=sentencepiece \ - model.tokenizer.model=/home/TestData/multimodal/tiny-neva/tokenizer_add_special.model \ - model.num_layers=2 \ - model.hidden_size=5120 \ - model.ffn_hidden_size=13824 \ - model.num_attention_heads=40 \ - model.normalization=rmsnorm \ - model.data.num_workers=0 \ - model.data.conv_template=llama_2 \ - model.mm_cfg.vision_encoder.from_pretrained='openai/clip-vit-large-patch14' \ - model.mm_cfg.llm.from_pretrained=null \ - model.use_flash_attention=false \ - exp_manager.exp_dir=/home/TestData/multimodal/neva_pretrain_tp1 " - sh "rm -rf /home/TestData/multimodal/neva_pretrain_tp1" - } - } - - stage('Setup test data and models') { - steps { - sh 'python -m tests.setup --save_dir /home/TestData/nlp' - } - } - - // TODO: this requires TE >= v0.11 which is not available in 23.06. - // please uncomment this test once mcore CI is ready. - - - stage('L2: Community LLM Checkpoints tests') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage('Llama') { - steps { - sh 'CUDA_VISIBLE_DEVICES=0 python scripts/checkpoint_converters/convert_llama_hf_to_nemo.py \ - --input_name_or_path=/home/TestData/nlp/megatron_llama/llama-ci-hf \ - --output_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo' - } - } - stage('StarCoder') { - steps { - sh 'python scripts/checkpoint_converters/convert_starcoder_hf_to_nemo.py \ - --input_name_or_path /home/TestData/nlp/megatron_gpt/starcoder-ci-hf \ - --output_path /home/TestData/nlp/megatron_gpt/starcoder-ci-hf' - sh 'rm -f /home/TestData/nlp/megatron_gpt/starcoder-ci-hf/megatron_starcoder_tp1_pp1.nemo' - } - } - stage('Falcon') { - steps { - sh 'python scripts/checkpoint_converters/convert_falcon_hf_to_nemo.py \ - --input_name_or_path /home/TestData/nlp/megatron_gpt/falcon-ci-hf \ - --output_path /home/TestData/nlp/megatron_gpt/falcon-ci-hf/falcon_ci.nemo' - sh 'rm -f /home/TestData/nlp/megatron_gpt/falcon-ci-hf/falcon_ci.nemo' - } - } - stage('Baichuan2') { - steps { - sh 'python scripts/checkpoint_converters/convert_baichuan2_hf_to_nemo.py \ - --input_name_or_path=/home/TestData/nlp/megatron_gpt/Baichuan2-7B-Base \ - --output_path=/home/TestData/nlp/megatron_gpt/Baichuan2-7B-Base/ci.nemo' - sh 'rm -f /home/TestData/nlp/megatron_gpt/Baichuan2-7B-Base/ci.nemo' - } - } - } - } - - stage('L2: Nemo PTQ') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage('Llama2 - Export Only') { - steps { - sh 'python examples/nlp/language_modeling/megatron_llama_quantization.py \ - model_file=/home/TestData/nlp/megatron_llama/llama_ci.nemo \ - quantization.algorithm=null \ - model_save=/home/TestData/nlp/megatron_llama/ci_baseline' - sh 'rm -rf /home/TestData/nlp/megatron_llama/ci_baseline' - } - } - stage('Llama2 - INT8 SQ') { - steps { - sh 'python examples/nlp/language_modeling/megatron_llama_quantization.py \ - model_file=/home/TestData/nlp/megatron_llama/llama_ci_megatron_amp_O2_hf_tokenizer.nemo \ - 
quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \ - quantization.algorithm=int8_sq \ - quantization.num_calib_size=8 \ - inference.batch_size=2 \ - model_save=/home/TestData/nlp/megatron_llama/ci_int8_sq.qnemo' - sh 'rm -f /home/TestData/nlp/megatron_llama/ci_int8_sq.qnemo' - } - } - stage('Llama2 - FP8') { - steps { - sh 'python examples/nlp/language_modeling/megatron_llama_quantization.py \ - model_file=/home/TestData/nlp/megatron_llama/llama_ci.nemo \ - tensor_model_parallel_size=2 \ - trainer.devices=2 \ - quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \ - quantization.algorithm=fp8 \ - quantization.num_calib_size=8 \ - inference.batch_size=2 \ - export.inference_tensor_parallel=2 \ - model_save=/home/TestData/nlp/megatron_llama/ci_fp8.qnemo' - sh 'rm -f /home/TestData/nlp/megatron_llama/ci_fp8.qnemo' - } - } - } - } - - stage('L2: ASR dev run') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage('Speech to Text') { - steps { - sh 'python examples/asr/asr_ctc/speech_to_text_ctc.py \ - model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - trainer.devices=[0] \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=examples/asr/speech_to_text_results' - sh 'rm -rf examples/asr/speech_to_text_results' - } - } - - stage('Speech to Text WPE - CitriNet') { - steps { - sh 'python examples/asr/asr_ctc/speech_to_text_ctc_bpe.py \ - --config-path="../conf/citrinet/" --config-name="config_bpe" \ - model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - model.tokenizer.dir="/home/TestData/asr_tokenizers/an4_wpe_128/" \ - model.tokenizer.type="wpe" \ - trainer.devices=[1] \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=examples/asr/speech_to_text_wpe_results' - sh 'rm -rf examples/asr/speech_to_text_wpe_results' - } - } - - stage('Speech Pre-training - CitriNet') { - steps { - sh 'python examples/asr/speech_pretraining/speech_pre_training.py \ - --config-path="../conf/ssl/citrinet/" --config-name="citrinet_ssl_ci" \ - model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - trainer.devices=[1] \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=examples/asr/speech_pre_training_results' - sh 'rm -rf examples/asr/speech_pre_training_results' - } - } - - stage('Speech To Text Finetuning') { - steps { - sh 'python examples/asr/speech_to_text_finetune.py \ - --config-path="conf/asr_finetune" --config-name="speech_to_text_finetune" \ - model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - init_from_nemo_model=/home/TestData/asr/stt_en_fastconformer_transducer_large.nemo \ - model.tokenizer.update_tokenizer=False \ - trainer.devices=[1] \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=examples/asr/speech_finetuning_results' - sh 'rm -rf examples/asr/speech_finetuning_results' - } - } - - stage('Speech To Text HF Finetuning') { - steps { - sh 'python examples/asr/speech_to_text_finetune.py \ - --config-path="conf/asr_finetune" 
--config-name="speech_to_text_hf_finetune" \ - ~model.train_ds.hf_data_cfg \ - model.train_ds.num_workers=1 \ - model.train_ds.batch_size=2 model.validation_ds.batch_size=2 \ - model.train_ds.streaming=true \ - +model.train_ds.hf_data_cfg.path="librispeech_asr" \ - +model.train_ds.hf_data_cfg.name=null \ - +model.train_ds.hf_data_cfg.split="test.clean" \ - +model.train_ds.hf_data_cfg.streaming=true \ - ~model.validation_ds.hf_data_cfg \ - model.validation_ds.streaming=true \ - +model.validation_ds.hf_data_cfg.path="librispeech_asr" \ - +model.validation_ds.hf_data_cfg.name=null \ - +model.validation_ds.hf_data_cfg.split="test.clean" \ - +model.validation_ds.hf_data_cfg.streaming=true \ - ~model.test_ds \ - init_from_nemo_model=/home/TestData/asr/stt_en_fastconformer_transducer_large.nemo \ - model.tokenizer.update_tokenizer=False \ - model.optim.sched.warmup_steps=0 \ - +model.optim.sched.max_steps=3 \ - trainer.max_epochs=null \ - trainer.devices=[1] \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=examples/asr/speech_finetuning_results' - sh 'rm -rf examples/asr/speech_finetuning_results' - } - } - - // TODO: Please Fix Me - // Error locating target 'nemo.collections.asr.modules.wav2vec_modules.ConvFeatureEncoder', see chained exception above. - // stage('L2: Speech Pre-training - Wav2Vec') { - // steps { - // sh 'python examples/asr/speech_pretraining/speech_pre_training.py \ - // --config-path="../conf/ssl/wav2vec/" --config-name="wav2vec_ci" \ - // model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - // model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - // trainer.devices=[1] \ - // trainer.accelerator="gpu" \ - // +trainer.fast_dev_run=True \ - // exp_manager.exp_dir=examples/asr/speech_pre_training_results' - // sh 'rm -rf examples/asr/speech_pre_training_results' - // } - // } - - stage('L2: Speech to Text WPE - Conformer') { - steps { - sh 'python examples/asr/asr_ctc/speech_to_text_ctc_bpe.py \ - --config-path="../conf/conformer" --config-name="conformer_ctc_bpe" \ - model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - model.tokenizer.dir="/home/TestData/asr_tokenizers/an4_wpe_128/" \ - model.tokenizer.type="wpe" \ - model.train_ds.batch_size=4 \ - model.validation_ds.batch_size=4 \ - trainer.devices=[1] \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=examples/asr/speech_to_text_wpe_conformer_results' - sh 'rm -rf examples/asr/speech_to_text_wpe_conformer_results' - } - } - } - } - - stage('L2: ASR dev run - part two') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage('L2: Speech to Text WPE - Squeezeformer') { - steps { - sh 'python examples/asr/asr_ctc/speech_to_text_ctc_bpe.py \ - --config-path="../conf/squeezeformer" --config-name="squeezeformer_ctc_bpe" \ - model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - model.tokenizer.dir="/home/TestData/asr_tokenizers/an4_wpe_128/" \ - model.tokenizer.type="wpe" \ - model.encoder.d_model=144 \ - model.train_ds.batch_size=4 \ - model.validation_ds.batch_size=4 \ - trainer.devices=[0] \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=examples/asr/speech_to_text_wpe_squeezeformer_results' - sh 'rm -rf 
examples/asr/speech_to_text_wpe_squeezeformer_results' - } - } - } - } - - stage('L2: Speech to Text EMA') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - steps { - sh 'python examples/asr/asr_ctc/speech_to_text_ctc.py \ - model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - trainer.devices=2 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - +exp_manager.ema.enable=True \ - exp_manager.exp_dir=examples/asr/speech_to_text_results' - sh 'rm -rf examples/asr/speech_to_text_results' - } - - } - - stage('L2: Speech to Text AED') { - when { - anyOf { - branch 'r1.23.0' - changeRequest target: 'r1.23.0' - } - } - steps { - sh 'python examples/asr/speech_multitask/speech_to_text_aed.py \ - model.prompt_format=canary \ - model.model_defaults.asr_enc_hidden=256 \ - model.model_defaults.lm_dec_hidden=256 \ - model.encoder.n_layers=12 \ - model.transf_encoder.num_layers=0 \ - model.transf_decoder.config_dict.num_layers=12 \ - model.train_ds.manifest_filepath=/home/TestData/asr/manifests/canary/an4_canary_train.json \ - ++model.train_ds.is_tarred=false \ - model.train_ds.batch_duration=60 \ - +model.train_ds.text_field="answer" \ - +model.train_ds.lang_field="target_lang" \ - model.validation_ds.manifest_filepath=/home/TestData/asr/manifests/canary/an4_canary_val.json \ - +model.validation_ds.text_field="answer" \ - +model.validation_ds.lang_field="target_lang" \ - model.test_ds.manifest_filepath=/home/TestData/asr/manifests/canary/an4_canary_val.json \ - +model.test_ds.text_field="answer" \ - +model.test_ds.lang_field="target_lang" \ - model.tokenizer.langs.spl_tokens.dir=/home/TestData/asr_tokenizers/canary/canary_spl_tokenizer_v32 \ - model.tokenizer.langs.spl_tokens.type="bpe" \ - model.tokenizer.langs.en.dir=/home/TestData/asr_tokenizers/canary/en/tokenizer_spe_bpe_v1024_max_4 \ - model.tokenizer.langs.en.type=bpe \ - ++model.tokenizer.langs.es.dir=/home/TestData/asr_tokenizers/canary/es/tokenizer_spe_bpe_v1024_max_4 \ - ++model.tokenizer.langs.es.type=bpe \ - trainer.devices=[0] \ - trainer.accelerator="gpu" \ - +trainer.use_distributed_sampler=false \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=examples/asr/speech_to_text_aed_results' - sh 'rm -rf examples/asr/speech_to_text_results' - } - - } - - stage('L2: Speaker dev run') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage('Speaker Recognition') { - steps { - sh 'python examples/speaker_tasks/recognition/speaker_reco.py \ - model.train_ds.batch_size=10 \ - model.validation_ds.batch_size=2 \ - model.train_ds.manifest_filepath=/home/TestData/an4_speaker/train.json \ - model.validation_ds.manifest_filepath=/home/TestData/an4_speaker/dev.json \ - model.decoder.num_classes=2 \ - trainer.max_epochs=10 \ - trainer.devices=[1] \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=examples/speaker_tasks/recognition/speaker_recognition_results' - sh 'rm -rf examples/speaker_tasks/recognition/speaker_recognition_results' - } - } - - stage('Speaker Diarization') { - steps { - sh 'python examples/speaker_tasks/diarization/neural_diarizer/multiscale_diar_decoder.py \ - model.diarizer.speaker_embeddings.model_path=titanet_large \ - model.train_ds.batch_size=5 \ - model.validation_ds.batch_size=5 \ - model.train_ds.emb_dir=examples/speaker_tasks/diarization/speaker_diarization_results \ - 
model.validation_ds.emb_dir=examples/speaker_tasks/diarization/speaker_diarization_results \ - model.train_ds.manifest_filepath=/home/TestData/an4_diarizer/simulated_train/msdd_data.50step.json \ - model.validation_ds.manifest_filepath=/home/TestData/an4_diarizer/simulated_valid/msdd_data.50step.json \ - trainer.devices=[1] \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=examples/speaker_tasks/diarization/speaker_diarization_results' - sh 'rm -rf examples/speaker_tasks/diarization/speaker_diarization_results' - } - } - - stage('Speech to Label') { - steps { - sh 'python examples/asr/speech_classification/speech_to_label.py \ - model.train_ds.manifest_filepath=/home/TestData/speech_commands/train_manifest.json \ - model.validation_ds.manifest_filepath=/home/TestData/speech_commands/test_manifest.json \ - model.test_ds.manifest_filepath=/home/TestData/speech_commands/test_manifest.json \ - trainer.devices=[1] \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - model.preprocessor._target_=nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor \ - ~model.preprocessor.window_size \ - ~model.preprocessor.window_stride \ - ~model.preprocessor.window \ - ~model.preprocessor.n_mels \ - ~model.preprocessor.n_mfcc \ - ~model.preprocessor.n_fft \ - exp_manager.exp_dir=examples/asr/speech_to_label_results' - sh 'rm -rf examples/asr/speech_to_label_results' - } - } - - stage('Speaker Diarization with ASR Inference') { - steps { - sh 'python examples/speaker_tasks/diarization/clustering_diarizer/offline_diar_with_asr_infer.py \ - diarizer.manifest_filepath=/home/TestData/an4_diarizer/an4_manifest.json \ - diarizer.speaker_embeddings.model_path=/home/TestData/an4_diarizer/spkr.nemo \ - diarizer.speaker_embeddings.parameters.save_embeddings=True \ - diarizer.speaker_embeddings.parameters.window_length_in_sec=[1.5] \ - diarizer.speaker_embeddings.parameters.shift_length_in_sec=[0.75] \ - diarizer.speaker_embeddings.parameters.multiscale_weights=[1.0] \ - diarizer.asr.model_path=QuartzNet15x5Base-En \ - diarizer.asr.parameters.asr_based_vad=True \ - diarizer.out_dir=examples/speaker_tasks/diarization/speaker_diarization_asr_results' - sh 'rm -rf examples/speaker_tasks/diarization/speaker_diarization_asr_results' - } - } - - stage('Clustering Diarizer Inference') { - steps { - sh 'python examples/speaker_tasks/diarization/clustering_diarizer/offline_diar_infer.py \ - diarizer.manifest_filepath=/home/TestData/an4_diarizer/an4_manifest.json \ - diarizer.speaker_embeddings.model_path=/home/TestData/an4_diarizer/spkr.nemo \ - diarizer.speaker_embeddings.parameters.save_embeddings=True \ - diarizer.speaker_embeddings.parameters.window_length_in_sec=1.5 \ - diarizer.speaker_embeddings.parameters.shift_length_in_sec=0.75 \ - diarizer.speaker_embeddings.parameters.multiscale_weights=null \ - diarizer.vad.model_path=/home/TestData/an4_diarizer/MatchboxNet_VAD_3x2.nemo \ - diarizer.out_dir=examples/speaker_tasks/diarization/clustering_diarizer_results' - sh 'rm -rf examples/speaker_tasks/diarization/clustering_diarizer_results' - } - } - - stage('Neural Diarizer Inference') { - steps { - sh 'python examples/speaker_tasks/diarization/neural_diarizer/multiscale_diar_decoder_infer.py \ - diarizer.manifest_filepath=/home/TestData/an4_diarizer/an4_manifest.json \ - diarizer.msdd_model.model_path=/home/TestData/an4_diarizer/diar_msdd_telephonic.nemo \ - diarizer.speaker_embeddings.parameters.save_embeddings=True \ - 
diarizer.vad.model_path=/home/TestData/an4_diarizer/MatchboxNet_VAD_3x2.nemo \ - diarizer.out_dir=examples/speaker_tasks/diarization/neural_diarizer_results' - sh 'rm -rf examples/speaker_tasks/diarization/neural_diarizer_results' - } - } - - stage('Multispeaker ASR Data Simulation') { - steps { - sh 'python tools/speech_data_simulator/multispeaker_simulator.py \ - --config-path=conf --config-name=data_simulator.yaml \ - data_simulator.random_seed=42 \ - data_simulator.manifest_filepath=/home/TestData/LibriSpeechShort/dev-clean-align-short.json \ - data_simulator.outputs.output_dir=./test_simulator \ - data_simulator.session_config.num_sessions=2 \ - data_simulator.session_config.session_length=60' - sh 'rm -rf ./test_simulator' - } - } - } - } - // TODO: Enable test after 21.08 container is used. - // stage('L2: ASR DALI dev run') { - // when { - // anyOf { - // branch 'main' - // changeRequest target: 'main' - // } - // } - // failFast true - // parallel { - // stage('Speech to Text - DALI AudioToMelSpectrogramPreprocessor') { - // steps { - // sh 'python examples/asr/asr_ctc/speech_to_text_ctc.py \ - // model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - // +model.train_ds.use_dali=True \ - // model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - // +model.validation_ds.use_dali=True \ - // trainer.devices=[0] \ - // trainer.accelerator="gpu" \ - // +trainer.fast_dev_run=True \ - // exp_manager.exp_dir=examples/asr/speech_to_text_results' - // sh 'rm -rf examples/asr/speech_to_text_results' - // } - // } - // stage('Speech to Text BPE - DALI AudioToMelSpectrogramPreprocessor') { - // steps { - // sh 'python examples/asr/asr_ctc/speech_to_text_bpe.py \ - // --config-path="../conf/citrinet/" --config-name="config_bpe" \ - // model.tokenizer.dir="/home/TestData/asr_tokenizers/an4_wpe_128/" \ - // model.tokenizer.type="wpe" \ - // model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - // +model.train_ds.use_dali=True \ - // model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - // +model.validation_ds.use_dali=True \ - // trainer.devices=[0] \ - // trainer.accelerator="gpu" \ - // +trainer.fast_dev_run=True \ - // exp_manager.exp_dir=examples/asr/speech_to_text_wpe_results' - // sh 'rm -rf examples/asr/speech_to_text_wpe_results' - // } - // } - // // TODO: This would fail due to an unnecessary torchaudio import. 
- // // To be enabled once torchaudio is available in the container used for CI - // // stage('Speech to Text - DALI AudioToMFCCPreprocessor') { - // // steps { - // // sh 'python examples/asr/asr_ctc/speech_to_text_ctc.py \ - // // model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - // // +model.train_ds.use_dali=True \ - // // model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - // // +model.validation_ds.use_dali=True \ - // // model.preprocessor._target_=nemo.collections.asr.modules.AudioToMFCCPreprocessor \ - // // ~model.preprocessor.normalize \ - // // ~model.preprocessor.features \ - // // ~model.preprocessor.frame_splicing \ - // // ~model.preprocessor.dither \ - // // ~model.preprocessor.stft_conv \ - // // +model.n_mels=64 \ - // // +model.n_mfcc=64 \ - // // trainer.devices=[1] \ - // // trainer.accelerator="gpu" \ - // // +trainer.fast_dev_run=True \ - // // exp_manager.exp_dir=examples/asr/speech_to_text_results' - // // sh 'rm -rf examples/asr/speech_to_text_results' - // // } - // // } - // } - // } - - // TODO: Add back once CI is updated - // stage('L2: ASR RNNT dev run') { - // when { - // anyOf { - // branch 'main' - // changeRequest target: 'main' - // } - // } - // failFast true - // parallel { - // stage('Speech to Text - RNNT') { - // steps { - // sh 'STRICT_NUMBA_COMPAT_CHECK=false python examples/asr/asr_transducer/speech_to_text_rnnt.py \ - // --config-path="../conf/contextnet_rnnt/" --config-name="config_rnnt.yaml" \ - // model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - // model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - // model.train_ds.batch_size=2 \ - // model.validation_ds.batch_size=2 \ - // trainer.devices=[0] \ - // trainer.accelerator="gpu" \ - // +trainer.fast_dev_run=True \ - // exp_manager.exp_dir=examples/asr/speech_to_text_rnnt_results' - // sh 'rm -rf examples/asr/speech_to_text_rnnt_results' - // } - // } - // stage('L2: Speech to Text RNNT WPE') { - // steps { - // sh 'STRICT_NUMBA_COMPAT_CHECK=false python examples/asr/asr_transducer/speech_to_text_rnnt_bpe.py \ - // --config-path="../conf/contextnet_rnnt/" --config-name="config_rnnt_bpe.yaml" \ - // model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - // model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - // model.train_ds.batch_size=2 \ - // model.validation_ds.batch_size=2 \ - // model.tokenizer.dir="/home/TestData/asr_tokenizers/an4_wpe_128/" \ - // model.tokenizer.type="wpe" \ - // trainer.devices=[0] \ - // trainer.accelerator="gpu" \ - // +trainer.fast_dev_run=True \ - // exp_manager.exp_dir=examples/asr/speech_to_text_rnnt_wpe_results' - // sh 'rm -rf examples/asr/speech_to_text_rnnt_wpe_results' - // } - // } - // stage('L3: Speech to Text Hybrid Transducer-CTC WPE') { - // steps { - // sh 'STRICT_NUMBA_COMPAT_CHECK=false python examples/asr/asr_hybrid_transducer_ctc/speech_to_text_hybrid_rnnt_ctc_bpe.py \ - // --config-path="../conf/conformer/hybrid_transducer_ctc/conformer_hybrid_transducer_ctc/" --config-name="conformer_hybrid_transducer_ctc_bpe.yaml" \ - // model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - // model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - // model.encoder.n_layers= 2 \ - // model.train_ds.batch_size=2 \ - // model.validation_ds.batch_size=2 \ - // model.tokenizer.dir="/home/TestData/asr_tokenizers/an4_wpe_128/" \ - // 
model.tokenizer.type="wpe" \ - // trainer.devices=[0] \ - // trainer.accelerator="gpu" \ - // +trainer.fast_dev_run=True \ - // exp_manager.exp_dir=examples/asr/speech_to_text_hybrid_transducer_ctc_wpe_results' - // sh 'rm -rf examples/asr/speech_to_text_hybrid_transducer_ctc_wpe_results' - // } - // } - // } - // } - - // stage('L2: Hybrid ASR RNNT-CTC dev run') { - // when { - // anyOf { - // branch 'main' - // changeRequest target: 'main' - // } - // } - // failFast true - // parallel { - // stage('Speech to Text Hybrid Transducer-CTC WPE') { - // steps { - // sh 'STRICT_NUMBA_COMPAT_CHECK=false python examples/asr/asr_hybrid_transducer_ctc/speech_to_text_hybrid_rnnt_ctc_bpe.py \ - // --config-path="../conf/conformer/hybrid_transducer_ctc/conformer_hybrid_transducer_ctc/" --config-name="conformer_hybrid_transducer_ctc_bpe.yaml" \ - // model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - // model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - // model.encoder.n_layers= 2 \ - // model.train_ds.batch_size=2 \ - // model.validation_ds.batch_size=2 \ - // model.tokenizer.dir="/home/TestData/asr_tokenizers/an4_wpe_128/" \ - // model.tokenizer.type="wpe" \ - // trainer.devices=[0] \ - // trainer.accelerator="gpu" \ - // +trainer.fast_dev_run=True \ - // exp_manager.exp_dir=examples/asr/speech_to_text_hybrid_transducer_ctc_wpe_results' - // sh 'rm -rf examples/asr/speech_to_text_hybrid_transducer_ctc_wpe_results' - // } - // } - // } - // } - - stage('L2: ASR Multi-dataloader dev run') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage('Speech to Text multi-dataloader') { - steps { - sh 'python examples/asr/asr_ctc/speech_to_text_ctc.py \ - model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - model.validation_ds.manifest_filepath=[/home/TestData/an4_dataset/an4_val.json,/home/TestData/an4_dataset/an4_val.json] \ - trainer.devices=[0] \ - trainer.accelerator="gpu" \ - trainer.max_epochs=1 \ - trainer.max_steps=1 \ - +trainer.num_sanity_val_steps=1 \ - exp_manager.exp_dir=examples/asr/speech_to_text_results' - sh 'rm -rf examples/asr/speech_to_text_results' - } - } - - stage('Speech to Label multi-dataloader') { - steps { - sh 'python examples/asr/speech_classification/speech_to_label.py \ - model.train_ds.manifest_filepath=/home/TestData/speech_commands/train_manifest.json \ - model.validation_ds.manifest_filepath=[/home/TestData/speech_commands/test_manifest.json,/home/TestData/speech_commands/test_manifest.json] \ - trainer.devices=[1] \ - trainer.accelerator="gpu" \ - trainer.max_epochs=1 \ - trainer.max_steps=1 \ - +trainer.num_sanity_val_steps=1 \ - model.preprocessor._target_=nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor \ - ~model.preprocessor.window_size \ - ~model.preprocessor.window_stride \ - ~model.preprocessor.window \ - ~model.preprocessor.n_mels \ - ~model.preprocessor.n_mfcc \ - ~model.preprocessor.n_fft \ - exp_manager.exp_dir=examples/asr/speech_to_label_results' - sh 'rm -rf examples/asr/speech_to_label_results' - } - } - } - } - - stage('L2: ASR Adapters') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage('Linear Adapters') { - steps { - sh 'python examples/asr/asr_adapters/train_asr_adapter.py \ - model.pretrained_model="stt_en_conformer_ctc_small" \ - model.adapter.adapter_name="an4" \ - model.adapter.linear.in_features=176 \ - 
model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - trainer.max_steps=5 \ - trainer.devices=[0] \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=examples/asr/speech_to_text_adapters_results' - sh 'rm -rf examples/asr/speech_to_text_adapters_results' - } - } - stage('RelPos MHA Adapters') { - steps { - sh 'python examples/asr/asr_adapters/train_asr_adapter.py \ - model.pretrained_model="stt_en_conformer_ctc_small" \ - model.adapter.adapter_name="encoder:an4" \ - model.adapter.adapter_type="tiny_attn" \ - model.adapter.tiny_attn.n_feat=176 \ - model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - trainer.max_steps=5 \ - trainer.devices=[0] \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=examples/asr/speech_to_text_adapters_mha_results' - sh 'rm -rf examples/asr/speech_to_text_adapters_mha_results' - } - } - - } - } - - stage('L2: Speech Transcription') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage('Speech to Text Transcribe') { - steps { - sh 'python examples/asr/transcribe_speech.py \ - pretrained_name="QuartzNet15x5Base-En" \ - audio_dir="/home/TestData/an4_transcribe/test_subset/" \ - output_filename="stt_test_res.json" \ - amp=true' - sh 'rm -rf stt_test_res.json' - } - } - } - } - stage('L2: Transducer alignment') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage('Running pytest') { - steps { - sh 'pytest tests/collections/asr/decoding/rnnt_alignments_check.py --durations=-1' - } - } - } - } - - stage('L2: Segmentation Tool') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - stages { - stage('Install ctc_segmentation requirements') { - steps { - sh 'cd tools/ctc_segmentation && \ - pip install -r requirements.txt && \ - apt-get update && apt-get install libsox-fmt-all -y' - } - } - - stage('Parallel ctc_segmentation test') { - failFast true - parallel { - stage('L2: Eng CitriNet with .wav') { - steps { - sh 'cd tools/ctc_segmentation && \ - TIME=`date +"%Y-%m-%d-%T"` && \ - /bin/bash run_segmentation.sh \ - --MODEL_NAME_OR_PATH="stt_en_citrinet_512_gamma_0_25" \ - --DATA_DIR=/home/TestData/ctc_segmentation/eng \ - --OUTPUT_DIR=/home/TestData/ctc_segmentation/eng/output${TIME} \ - --LANGUAGE=en \ - --USE_NEMO_NORMALIZATION="TRUE" && \ - python /home/TestData/ctc_segmentation/verify_alignment.py \ - -r /home/TestData/ctc_segmentation/eng/eng_valid_segments_1.7.txt \ - -g /home/TestData/ctc_segmentation/eng/output${TIME}/verified_segments/nv_test_segments.txt && \ - rm -rf /home/TestData/ctc_segmentation/eng/output${TIME}' - } - } - stage('L2: Ru QN with mp3') { - steps { - sh 'cd tools/ctc_segmentation && \ - TIME=`date +"%Y-%m-%d-%T"` && \ - /bin/bash run_segmentation.sh \ - --MODEL_NAME_OR_PATH=/home/TestData/ctc_segmentation/QuartzNet15x5-Ru-e512-wer14.45.nemo \ - --DATA_DIR=/home/TestData/ctc_segmentation/ru \ - --OUTPUT_DIR=/home/TestData/ctc_segmentation/ru/output${TIME} \ - --LANGUAGE=ru \ - --ADDITIONAL_SPLIT_SYMBOLS=";" && \ - python /home/TestData/ctc_segmentation/verify_alignment.py \ - -r /home/TestData/ctc_segmentation/ru/valid_ru_segments_1.7.txt \ - -g /home/TestData/ctc_segmentation/ru/output${TIME}/verified_segments/ru_segments.txt 
&& \ - rm -rf /home/TestData/ctc_segmentation/ru/output${TIME}' - } - } - } - } - } - } - - stage('L2: G2P Models') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage('G2P Conformer training, evaluation and inference') { - steps { - sh 'cd examples/tts/g2p && \ - TIME=`date +"%Y-%m-%d-%T"` && OUTPUT_DIR_CONFORMER=output_ctc_${TIME} && \ - python g2p_train_and_evaluate.py \ - train_manifest=/home/TestData/g2p/g2p.json \ - validation_manifest=/home/TestData/g2p/g2p.json \ - model.test_ds.manifest_filepath=/home/TestData/g2p/g2p.json \ - model.tokenizer.dir=/home/TestData/g2p/tokenizer_spe_unigram_v512 \ - trainer.max_epochs=1 \ - model.max_source_len=64 \ - trainer.devices=[0] \ - do_training=True \ - do_testing=True \ - exp_manager.exp_dir=${OUTPUT_DIR_CONFORMER} \ - +exp_manager.use_datetime_version=False\ - +exp_manager.version=test \ - --config-name=g2p_conformer_ctc && \ - python g2p_inference.py \ - pretrained_model=${OUTPUT_DIR_CONFORMER}/G2P-Conformer-CTC/test/checkpoints/G2P-Conformer-CTC.nemo \ - manifest_filepath=/home/TestData/g2p/g2p.json \ - phoneme_field=text' - } - } - // TODO: pleasefixme @redoctopus - // stage('ByT5G2P training, evaluation and inference') { - // steps { - // sh 'cd examples/tts/g2p && \ - // TIME=`date +"%Y-%m-%d-%T"` && OUTPUT_DIR_T5=output_byt5_${TIME} && \ - // python g2p_train_and_evaluate.py \ - // train_manifest=/home/TestData/g2p/g2p.json \ - // validation_manifest=/home/TestData/g2p/g2p.json \ - // model.test_ds.manifest_filepath=/home/TestData/g2p/g2p.json \ - // trainer.max_epochs=1 \ - // model.max_source_len=64 \ - // trainer.devices=[1] \ - // do_training=True \ - // do_testing=True \ - // exp_manager.exp_dir=${OUTPUT_DIR_T5} \ - // +exp_manager.use_datetime_version=False\ - // +exp_manager.version=test && \ - // python g2p_inference.py \ - // pretrained_model=${OUTPUT_DIR_T5}/T5G2P/test/checkpoints/T5G2P.nemo \ - // manifest_filepath=/home/TestData/g2p/g2p.json \ - // phoneme_field=text' - // } - // } - stage('HeteronymClassificationModel training, evaluation and inference') { - steps { - sh 'cd examples/tts/g2p && \ - TIME=`date +"%Y-%m-%d-%T"` && OUTPUT_DIR=output_${TIME} && \ - python g2p_heteronym_classification_train_and_evaluate.py \ - train_manifest=/home/TestData/g2p/manifest.json \ - validation_manifest=/home/TestData/g2p/manifest.json \ - test_manifest=/home/TestData/g2p/manifest.json \ - model.wordids=/home/TestData/g2p/wordids.tsv \ - trainer.max_epochs=1 \ - model.max_seq_length=64 \ - do_training=True \ - do_testing=True \ - exp_manager.exp_dir=${OUTPUT_DIR} \ - +exp_manager.use_datetime_version=False\ - +exp_manager.version=test && \ - python g2p_heteronym_classification_inference.py \ - manifest=/home/TestData/g2p/manifest.json \ - pretrained_model=${OUTPUT_DIR}/HeteronymClassification/test/checkpoints/HeteronymClassification.nemo \ - output_manifest=preds.json' - } - } - } - } - - // TODO: add test once megatron-bert is supported again - // stage('L2: Multi-GPU Megatron finetuning') { - // when { - // anyOf { - // branch 'main' - // changeRequest target: 'main' - // } - // } - // failFast true - // parallel { - // stage('L2: Cased Megatron finetuning on MRPC') { - // steps { - // sh 'cd examples/nlp/glue_benchmark && \ - // python glue_benchmark.py \ - // model.dataset.data_dir=/home/TestData/nlp/glue_fake/MRPC \ - // trainer.devices=[0,1] \ - // trainer.accelerator="gpu" \ - // +trainer.fast_dev_run=true \ - // model.dataset.use_cache=false \ - // 
model.language_model.pretrained_model_name=megatron-bert-345m-cased \ - // trainer.accelerator=gpu \ - // trainer.strategy=ddp \ - // exp_manager=null' - // } - // } - // } - // } - - stage('L2: STS-b') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage('GLUE STS-b with AlBERT') { - steps { - sh 'python examples/nlp/glue_benchmark/glue_benchmark.py \ - model.dataset.use_cache=false \ - model.task_name=sts-b \ - model.dataset.data_dir=/home/TestData/nlp/glue_fake/STS-B \ - trainer.devices=[1] \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - model.language_model.pretrained_model_name=albert-base-v1 \ - exp_manager=null' - } - } - stage('Test Restore Punctuation & Capitalization with AlBERT') { - steps { - sh 'data_dir="$(mktemp -d -p "$(pwd)")" && \ - cp /home/TestData/nlp/token_classification_punctuation/*.txt "${data_dir}"/ && \ - python examples/nlp/token_classification/punctuation_capitalization_train_evaluate.py \ - +do_training=false \ - +do_testing=true \ - pretrained_model=/home/TestData/nlp/pretrained_models/Punctuation_and_Capitalization_albert.nemo \ - +model.test_ds.use_cache=false \ - ~model.train_ds \ - ~model.validation_ds \ - model.test_ds.ds_item="${data_dir}" \ - trainer.devices=[1] \ - trainer.accelerator="gpu" \ - exp_manager=null && \ - rm -rf "${data_dir}"' - } - } -// stage('Test Restore Punctuation & Capitalization with RoBERTa') { -// steps { -// sh 'data_dir="$(mktemp -d -p "$(pwd)")" && \ -// cp /home/TestData/nlp/token_classification_punctuation/*.txt "${data_dir}"/ && \ -// python examples/nlp/token_classification/punctuation_capitalization_train_evaluate.py \ -// +do_training=false \ -// +do_testing=true \ -// pretrained_model=/home/TestData/nlp/pretrained_models/Punctuation_and_Capitalization_roberta.nemo \ -// +model.test_ds.use_cache=false \ -// ~model.train_ds \ -// ~model.validation_ds \ -// model.test_ds.ds_item="${data_dir}" \ -// trainer.devices=[1] \ -// trainer.accelerator="gpu" \ -// exp_manager=null && \ -// rm -rf "${data_dir}"' -// } -// } - } - } - stage('L2: Dialogue Classification') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage('Dialogue: Intent and slot classification using GPT') { - steps { - sh 'cd examples/nlp/dialogue && \ - python dialogue.py \ - model.dataset.data_dir=/home/TestData/nlp/sgd_small \ - model.language_model.lm_checkpoint=/home/TestData/nlp/gpt2/pytorch_model.bin\ - model.tokenizer.vocab_file=/home/TestData/nlp/gpt2/vocab.json\ - model.dataset.dialogues_example_dir=sgd_gen_outputs \ - model.dataset.task_name=debug_sample \ - trainer.max_steps=1 \ - trainer.max_epochs=1 \ - model.train_ds.batch_size=2 \ - model.validation_ds.batch_size=2 \ - model.test_ds.batch_size=2 \ - model.nemo_path=null \ - trainer.val_check_interval=0.0 \ - trainer.devices=[0] \ - model.dataset.use_cache=false \ - model.tokenizer.special_tokens={pad_token:"endoftext"} \ - model.tokenizer.tokenizer_name=gpt2 \ - model.tokenizer.vocab_file=/home/TestData/nlp/gpt2/vocab.json\ - model.language_model.pretrained_model_name=/home/TestData/nlp/gpt2 \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf sgd_gen_outputs' - } - } - stage('Intent and slot classification using SGDQA') { - steps { - sh 'cd examples/nlp/dialogue && \ - python dialogue.py \ - model.dataset.data_dir=/home/TestData/nlp/sgd_small \ - model.dataset.dialogues_example_dir=sgd_gen_bert_outputs \ - model.dataset.task_name=debug_sample \ - 
trainer.max_steps=1 \ - trainer.max_epochs=1 \ - model.train_ds.batch_size=2 \ - model.validation_ds.batch_size=2 \ - model.test_ds.batch_size=2 \ - model.dataset.num_tasks=6 \ - model.nemo_path=null \ - trainer.val_check_interval=0.0 \ - trainer.devices=[0] \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name=bert-base-cased \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf sgd_gen_bert_outputs' - } - } - stage('Intent and slot classification using IntentSlotClassificationModel') { - steps { - sh 'cd examples/nlp/dialogue && \ - python dialogue.py \ - model.dataset.data_dir=/home/TestData/nlp/processed_assistant \ - model.dataset.dialogues_example_dir=sgd_gen_bert_intent_classification_outputs \ - model.dataset.task=assistant \ - trainer.max_steps=1 \ - trainer.max_epochs=1 \ - model.train_ds.batch_size=2 \ - model.validation_ds.batch_size=2 \ - model.test_ds.batch_size=2 \ - model.nemo_path=null \ - trainer.val_check_interval=0.0 \ - trainer.devices=[0] \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name=bert-base-uncased \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf sgd_gen_bert_intent_classification_outputs' - } - } - stage('Intent classification using ZeroShotIntentModel') { - steps { - sh 'cd examples/nlp/dialogue && \ - python dialogue.py \ - do_training=False \ - model.dataset.data_dir=/home/TestData/nlp/drive_thru_revised \ - model.original_nemo_checkpoint=/home/TestData/nlp/drive_thru_revised/zeroshotintent_en_bert_base_uncased.nemo \ - model.dataset.dialogues_example_dir=sgd_gen_zero_shot_intent_classification_outputs \ - model.dataset.task=zero_shot \ - model.dataset.prompt_template="This example is" \ - trainer.max_steps=1 \ - trainer.max_epochs=1 \ - model.train_ds.batch_size=2 \ - model.validation_ds.batch_size=2 \ - model.test_ds.batch_size=2 \ - model.nemo_path=null \ - trainer.val_check_interval=0.0 \ - trainer.devices=[1] \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name=bert-base-uncased \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf sgd_gen_zero_shot_intent_classification_outputs' - } - } - stage('Design Intent classification using ZeroShotIntentModel') { - steps { - sh 'cd examples/nlp/dialogue && \ - python dialogue.py \ - do_training=False \ - model.dataset.data_dir=/home/TestData/nlp/design_dataset \ - model.original_nemo_checkpoint=/home/TestData/nlp/drive_thru_revised/zeroshotintent_en_bert_base_uncased.nemo \ - model.dataset.dialogues_example_dir=design_zero_shot_intent_classification_outputs \ - model.dataset.task=design \ - model.dataset.prompt_template="This example is related to" \ - model.library=megatron \ - trainer.max_steps=1 \ - trainer.max_epochs=1 \ - model.train_ds.batch_size=2 \ - model.validation_ds.batch_size=2 \ - model.test_ds.batch_size=2 \ - model.nemo_path=null \ - trainer.val_check_interval=0.0 \ - trainer.devices=[1] \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name=bert-base-uncased \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf design_zero_shot_intent_classification_outputs' - } - } - stage('Design Intent classification using ZeroShotIntentModel BART Classifier') { - steps { - sh 'cd examples/nlp/dialogue && \ - python dialogue.py \ - do_training=False \ - model.dataset.data_dir=/home/TestData/nlp/design_dataset \ - model.original_nemo_checkpoint=/home/TestData/nlp/drive_thru_revised/zeroshotintent_en_bert_base_uncased.nemo \ - 
model.dataset.dialogues_example_dir=design_zero_shot_intent_classification_bart_outputs \ - model.dataset.task=design \ - model.dataset.prompt_template="This example is related to" \ - model.library=huggingface \ - trainer.devices=[1] \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name=bert-base-uncased \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf design_zero_shot_intent_classification_bart_outputs' - } - } - stage('Design Intent classification using DialogueNearestNeighbourModel') { - steps { - sh 'cd examples/nlp/dialogue && \ - python dialogue.py \ - do_training=False \ - model.dataset.data_dir=/home/TestData/nlp/design_dataset \ - model.dataset.dialogues_example_dir=design_dialogue_nearest_neighbour_classification_outputs \ - model.dataset.task=design \ - model.dataset.prompt_template="" \ - model.library=huggingface \ - trainer.devices=[0] \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name=sentence-transformers/all-MiniLM-L6-v2 \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf design_dialogue_nearest_neighbour_classification_outputs' - } - } - } - } - stage('L2: Dialogue Generation') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage('Dialogue: Answer Extender using DialogueS2SGenerationModel') { - steps { - sh 'cd examples/nlp/dialogue && \ - python dialogue.py \ - do_training=False \ - model.dataset.data_dir=/home/TestData/nlp/ms-marco-qa \ - model.dataset.dialogues_example_dir=answer_extender_s2s \ - model.dataset.task=ms_marco \ - model.library=huggingface \ - model.dataset.debug_mode=True \ - trainer.max_steps=1 \ - trainer.max_epochs=1 \ - model.train_ds.batch_size=2 \ - model.validation_ds.batch_size=2 \ - model.test_ds.batch_size=2 \ - model.nemo_path=null \ - trainer.val_check_interval=0.0 \ - trainer.devices=[1] \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name=facebook/bart-large \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf answer_extender_s2s' - } - } - stage('Dialogue: SGD Based Answer Extender using DialogueS2SGenerationModel') { - steps { - sh 'cd examples/nlp/dialogue && \ - python dialogue.py \ - do_training=False \ - model.dataset.data_dir=/home/TestData/nlp/sgd_small \ - model.dataset.dialogues_example_dir=sgd_answer_extender_s2s \ - model.dataset.task_name=debug_sample \ - model.dataset.task=sgd_generation \ - model.dataset.input_field=utterance+system_actions \ - model.dataset.output_field=system_utterance \ - model.dataset.use_cache=false \ - model.dataset.system_utterance=next_turn \ - model.dataset.debug_mode=True \ - model.dataset.prompt_template=slots_values \ - model.library=huggingface \ - trainer.max_steps=1 \ - trainer.max_epochs=1 \ - model.train_ds.batch_size=2 \ - model.validation_ds.batch_size=2 \ - model.test_ds.batch_size=2 \ - model.nemo_path=null \ - trainer.val_check_interval=0.0 \ - trainer.devices=[0] \ - model.language_model.pretrained_model_name=facebook/bart-large \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf sgd_answer_extender_s2s' - } - } - } - } -// stage('L2: Dialogue Generation Part 2') { -// when { -// anyOf { -// branch 'main' -// changeRequest target: 'main' -// } -// } -// failFast true -// parallel { -// stage('Dialogue: Answer Extender using DialogueGPTGenerationModel') { -// steps { -// sh 'cd examples/nlp/dialogue && \ -// python dialogue.py \ -// do_training=False \ -// 
model.dataset.data_dir=/home/TestData/nlp/ms-marco-qa \ -// model.dataset.dialogues_example_dir=answer_extender \ -// model.library=huggingface \ -// model.dataset.task=ms_marco \ -// model.dataset.debug_mode=True \ -// trainer.val_check_interval=0.0 \ -// trainer.devices=[0] \ -// model.dataset.use_cache=false \ -// model.language_model.pretrained_model_name=gpt2 \ -// trainer.accelerator=gpu \ -// exp_manager=null && \ -// rm -rf answer_extender' -// } -// } -// } -// } - stage('L2: COPY') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage('Dialogue: Answer Extender using DialogueGPTGenerationModel') { - steps { - sh 'cd examples/nlp/dialogue && \ - python dialogue.py \ - do_training=False \ - model.dataset.data_dir=/home/TestData/nlp/ms-marco-qa \ - model.dataset.dialogues_example_dir=answer_extender \ - model.library=huggingface \ - model.dataset.task=ms_marco \ - model.dataset.debug_mode=True \ - trainer.val_check_interval=0.0 \ - trainer.devices=[0] \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name=gpt2 \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf answer_extender' - } - } - } - } - stage('L2: Duplex Text Normalization') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage('Duplex Text Normalization with Tarred dataset') { - steps { - sh 'cd examples/nlp/duplex_text_normalization && \ - python duplex_text_normalization_train.py \ - data.validation_ds.data_path=/home/TestData/nlp/duplex_text_norm/small_test.tsv \ - mode=tn \ - lang=en \ - tagger_model.do_training=false \ - decoder_model.transformer=t5-small \ - data.validation_ds.batch_size=2 \ - data.train_ds.use_cache=false \ - data.validation_ds.use_cache=false \ - data.test_ds.batch_size=2 \ - data.train_ds.decoder_data_augmentation=false \ - data.train_ds.num_workers=2 \ - decoder_trainer.devices=[0,1] \ - decoder_trainer.accelerator="gpu" \ - data.train_ds.use_tarred_dataset=true \ - +decoder_trainer.fast_dev_run=true \ - decoder_exp_manager.create_checkpoint_callback=false \ - data.train_ds.tar_metadata_file=/home/TestData/nlp/duplex_text_norm/tarred_small/metadata.json \ - data.test_ds.use_cache=false \ - data.test_ds.data_path=/home/TestData/nlp/duplex_text_norm/small_test.tsv' - } - } - } - } - // Runs out of memory on the 12G TITAN V (GPU 0 on main CI) - // TODO: add when megatron bert is supported again in NeMo - // stage('L2: MegaBERT Token Classification') { - // when { - // anyOf { - // branch 'main' - // changeRequest target: 'main' - // } - // } - // failFast true - // steps { - // sh 'cd examples/nlp/token_classification && \ - // python token_classification_train.py \ - // model.dataset.data_dir=/home/TestData/nlp/token_classification_punctuation/ \ - // model.language_model.pretrained_model_name=megatron-bert-345m-uncased \ - // model.train_ds.batch_size=10 \ - // model.dataset.max_seq_length=50 \ - // model.dataset.use_cache=false \ - // trainer.accelerator=gpu \ - // trainer.strategy=ddp \ - // trainer.precision=16 \ - // trainer.devices=[1] \ - // trainer.accelerator="gpu" \ - // +trainer.fast_dev_run=true \ - // exp_manager=null' - // } - // } - - stage('L2: BERT Text Classification') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage ('Text Classification with BERT Test') { - steps { - sh 'cd examples/nlp/text_classification && \ - python text_classification_with_bert.py \ - 
model.dataset.num_classes=6 \ - model.train_ds.file_path=/home/TestData/nlp/retail_text_classification/train.tsv \ - model.validation_ds.file_path=/home/TestData/nlp/retail_text_classification/dev.tsv \ - model.language_model.pretrained_model_name=distilbert-base-uncased \ - model.train_ds.batch_size=10 \ - model.dataset.max_seq_length=50 \ - model.dataset.use_cache=false \ - trainer.devices=[0] \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - exp_manager=null' - } - } - } - } - - stage('L2: Parallel BERT Question-Answering SQUAD v1.1 & v2.0') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage('BERT SQUAD 1.1') { - // Cannot do fast_dev_run because squad needs whole dev dataset - steps { - sh 'cd examples/nlp/question_answering && \ - python question_answering.py \ - model.train_ds.file=/home/TestData/nlp/squad_mini/v1.1/train-v1.1.json \ - model.dataset.use_cache=false \ - model.validation_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \ - model.test_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \ - model.train_ds.batch_size=2 \ - model.train_ds.num_samples=2 \ - model.validation_ds.batch_size=2 \ - model.validation_ds.num_samples=2 \ - model.test_ds.num_samples=2 \ - model.test_ds.batch_size=2 \ - trainer.max_epochs=1 \ - trainer.max_steps=1 \ - model.language_model.pretrained_model_name=bert-base-uncased \ - model.dataset.version_2_with_negative=false \ - trainer.precision=16 \ - trainer.devices=[0] \ - trainer.accelerator="gpu" \ - exp_manager=null' - } - } - stage('BERT SQUAD 2.0') { - // Cannot do fast_dev_run because squad needs whole dev dataset - steps { - sh 'cd examples/nlp/question_answering && \ - python question_answering.py \ - model.train_ds.file=/home/TestData/nlp/squad_mini/v2.0/train-v2.0.json \ - model.dataset.use_cache=false \ - model.train_ds.batch_size=2 \ - model.train_ds.num_samples=2 \ - model.validation_ds.batch_size=2 \ - model.validation_ds.num_samples=2 \ - trainer.max_epochs=1 \ - trainer.max_steps=1 \ - model.validation_ds.file=/home/TestData/nlp/squad_mini/v2.0/dev-v2.0.json \ - model.language_model.pretrained_model_name=bert-base-uncased \ - model.dataset.version_2_with_negative=true \ - trainer.precision=16 \ - trainer.devices=[1] \ - trainer.accelerator="gpu" \ - exp_manager=null' - } - } - } - } - - stage('L2: Parallel BART Question-Answering SQUAD v1.1 & v2.0') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage('BART SQUAD 1.1') { - // Cannot do fast_dev_run because squad needs whole dev dataset - steps { - sh 'cd examples/nlp/question_answering && \ - python question_answering.py \ - model.train_ds.file=/home/TestData/nlp/squad_mini/v1.1/train-v1.1.json \ - model.dataset.use_cache=false \ - model.dataset.check_if_answer_in_context=false \ - model.validation_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \ - model.test_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \ - model.train_ds.batch_size=2 \ - model.train_ds.num_samples=2 \ - model.validation_ds.batch_size=2 \ - model.validation_ds.num_samples=2 \ - model.test_ds.num_samples=2 \ - model.test_ds.batch_size=2 \ - trainer.max_epochs=1 \ - trainer.max_steps=1 \ - model.language_model.pretrained_model_name=facebook/bart-base \ - model.dataset.version_2_with_negative=false \ - trainer.precision=16 \ - trainer.devices=[0] \ - trainer.accelerator="gpu" \ - exp_manager=null' - } - } - stage('BART SQUAD 2.0') { - // Cannot do 
fast_dev_run because squad needs whole dev dataset - steps { - sh 'cd examples/nlp/question_answering && \ - python question_answering.py \ - model.train_ds.file=/home/TestData/nlp/squad_mini/v2.0/train-v2.0.json \ - model.dataset.use_cache=false \ - model.dataset.check_if_answer_in_context=false \ - model.train_ds.batch_size=2 \ - model.train_ds.num_samples=2 \ - model.validation_ds.batch_size=2 \ - model.validation_ds.num_samples=2 \ - trainer.max_epochs=1 \ - trainer.max_steps=1 \ - model.validation_ds.file=/home/TestData/nlp/squad_mini/v2.0/dev-v2.0.json \ - model.language_model.pretrained_model_name=facebook/bart-base \ - model.dataset.version_2_with_negative=true \ - trainer.precision=16 \ - trainer.devices=[1] \ - trainer.accelerator="gpu" \ - exp_manager=null' - } - } - } - } - - stage('L2: Parallel GPT2 Question-Answering SQUAD v1.1 & v2.0') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage('GPT2 SQUAD 1.1') { - // Cannot do fast_dev_run because squad needs whole dev dataset - steps { - sh 'cd examples/nlp/question_answering && \ - python question_answering.py \ - model.train_ds.file=/home/TestData/nlp/squad_mini/v1.1/train-v1.1.json \ - model.dataset.use_cache=false \ - model.dataset.check_if_answer_in_context=false \ - model.validation_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \ - model.test_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \ - model.train_ds.batch_size=2 \ - model.train_ds.num_samples=2 \ - model.validation_ds.batch_size=2 \ - model.validation_ds.num_samples=2 \ - model.test_ds.num_samples=2 \ - model.test_ds.batch_size=2 \ - trainer.max_epochs=1 \ - trainer.max_steps=1 \ - model.language_model.pretrained_model_name=gpt2 \ - model.dataset.version_2_with_negative=false \ - trainer.precision=16 \ - trainer.devices=[0] \ - trainer.accelerator="gpu" \ - exp_manager=null' - } - } - stage('GPT2 SQUAD 2.0') { - // Cannot do fast_dev_run because squad needs whole dev dataset - steps { - sh 'cd examples/nlp/question_answering && \ - python question_answering.py \ - model.train_ds.file=/home/TestData/nlp/squad_mini/v2.0/train-v2.0.json \ - model.dataset.use_cache=false \ - model.dataset.check_if_answer_in_context=false \ - model.train_ds.batch_size=2 \ - model.train_ds.num_samples=2 \ - model.validation_ds.batch_size=2 \ - model.validation_ds.num_samples=2 \ - trainer.max_epochs=1 \ - trainer.max_steps=1 \ - model.validation_ds.file=/home/TestData/nlp/squad_mini/v2.0/dev-v2.0.json \ - model.language_model.pretrained_model_name=gpt2 \ - model.dataset.version_2_with_negative=true \ - trainer.precision=16 \ - trainer.devices=[1] \ - trainer.accelerator="gpu" \ - exp_manager=null' - } - } - } - } - - stage('L2: Intent and Slot Classification Tasks') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage('L2: Intent and Slot Classification') { - steps { - sh 'cd examples/nlp/intent_slot_classification && \ - python intent_slot_classification.py \ - model.data_dir=/home/TestData/nlp/retail \ - model.validation_ds.prefix=dev \ - model.test_ds.prefix=dev \ - trainer.devices=[0] \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - exp_manager.exp_dir=checkpoints' - sh 'rm -rf checkpoints' - } - } - stage('L2: Multi-Label Intent and Slot Classification') { - steps { - sh 'cd examples/nlp/intent_slot_classification && \ - python multi_label_intent_slot_classification.py \ - model.data_dir=/home/TestData/nlp/new_multiatis \ - 
model.validation_ds.prefix=dev \ - model.test_ds.prefix=dev \ - trainer.devices=[0] \ - +trainer.fast_dev_run=true \ - exp_manager.exp_dir=checkpoints2' - sh 'rm -rf checkpoints2' - } - } - } - } - - // TODO: add when megatron-bert is supported again - // stage('L2: Model Parallel Size 2 Megatron Text Classification') { - // when { - // anyOf{ - // branch 'main' - // changeRequest target: 'main' - // } - // } - // failFast true - // steps{ - // sh 'cd examples/nlp/text_classification && \ - // python text_classification_with_bert.py \ - // trainer.devices=[0,1] \ - // trainer.accelerator="gpu" \ - // trainer.num_nodes=1 \ - // trainer.precision=16 \ - // trainer.gradient_clip_val=1.0 \ - // +trainer.fast_dev_run=true \ - // model.dataset.num_classes=6 \ - // model.train_ds.file_path=/home/TestData/nlp/retail_text_classification/train.tsv \ - // model.train_ds.batch_size=4 \ - // model.language_model.pretrained_model_name=megatron-bert-uncased \ - // model.language_model.config_file=/home/TestData/nlp/mp_2_bert_toy/config.json \ - // model.language_model.lm_checkpoint=/home/TestData/nlp/mp_2_bert_toy/iter_2000000 \ - // model.nemo_path=null \ - // ~model.infer_samples \ - // exp_manager=null' - // } - // } - - // stage('L2: Model Parallel Size 2 Megatron Autoresume') { - // when { - // anyOf{ - // branch 'main' - // changeRequest target: 'main' - // } - // } - // failFast true - // steps{ - // sh 'cd examples/nlp/text_classification && \ - // python text_classification_with_bert.py \ - // trainer.devices=[0,1] \ - // trainer.accelerator="gpu" \ - // trainer.num_nodes=1 \ - // trainer.precision=16 \ - // trainer.gradient_clip_val=1.0 \ - // trainer.max_epochs=1 \ - // +trainer.fast_dev_run=true \ - // model.dataset.num_classes=6 \ - // model.train_ds.file_path=/home/TestData/nlp/retail_text_classification/train.tsv \ - // model.train_ds.batch_size=4 \ - // model.language_model.pretrained_model_name=megatron-bert-uncased \ - // model.language_model.config_file=/home/TestData/nlp/mp_2_bert_toy/config.json \ - // model.language_model.lm_checkpoint=/home/TestData/nlp/mp_2_bert_toy/iter_2000000 \ - // model.nemo_path=null \ - // ~model.infer_samples \ - // +exp_manager.explicit_log_dir=/home/TestData/nlp/mp_autoresume \ - // +exp_manager.resume_if_exists=true' - // } - // } - - // stage('L2: Model Parallel Size 2 Megatron Evaluation from .nemo') { - // when { - // anyOf{ - // branch 'main' - // changeRequest target: 'main' - // } - // } - // failFast true - // steps{ - // sh 'cd examples/nlp/text_classification && \ - // python model_parallel_text_classification_evaluation.py \ - // trainer.devices=[0,1] \ - // trainer.accelerator="gpu" \ - // trainer.num_nodes=1 \ - // model.dataset.num_classes=6 \ - // model.test_ds.file_path=/home/TestData/nlp/retail_text_classification/dev.tsv \ - // model.nemo_path=/home/TestData/nlp/mp_2_nemo/retail_text_class_350M.nemo \ - // exp_manager=null' - // } - // } - - // stage('L2: Model Parallel Size 2 Megatron Train from .nemo') { - // when { - // anyOf{ - // branch 'main' - // changeRequest target: 'main' - // } - // } - // failFast true - // steps{ - // sh 'cd examples/nlp/token_classification && \ - // python token_classification_train.py \ - // pretrained_model=/home/TestData/nlp/mp_2_nemo/ner_350M.nemo \ - // model.dataset.data_dir=/home/TestData/nlp/ner/ \ - // model.train_ds.batch_size=2 \ - // model.dataset.use_cache=false \ - // trainer.devices=[0,1] \ - // trainer.accelerator="gpu" \ - // +trainer.fast_dev_run=true \ - // 
model.dataset.class_balancing="weighted_loss" \ - // exp_manager=null' - // } - // } - - stage('L2: Parallel NLP Examples 2') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage ('NER finetuning from pretrained Test') { - steps { - sh 'cd examples/nlp/token_classification && \ - python token_classification_train.py \ - pretrained_model=ner_en_bert \ - model.dataset.data_dir=/home/TestData/nlp/ner/ \ - model.train_ds.batch_size=2 \ - model.dataset.use_cache=false \ - trainer.devices=[0] \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - model.dataset.class_balancing="weighted_loss" \ - exp_manager.exp_dir=null' - } - } - stage ('Punctuation and capitalization finetuning from pretrained test') { - steps { - sh 'cd examples/nlp/token_classification && \ - data_dir="$(mktemp -d -p "$(pwd)")" && \ - cp /home/TestData/nlp/token_classification_punctuation/*.txt "${data_dir}"/ && \ - python punctuation_capitalization_train_evaluate.py \ - pretrained_model=punctuation_en_bert \ - model.train_ds.ds_item="${data_dir}" \ - model.validation_ds.ds_item="${data_dir}" \ - model.test_ds.ds_item="${data_dir}" \ - +model.train_ds.use_cache=false \ - +model.validation_ds.use_cache=false \ - +model.test_ds.use_cache=false \ - trainer.devices=[1] \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - exp_manager.exp_dir=null && \ - rm -rf "${data_dir}"' - } - } - stage ('NER with TurkuNLP/bert-base-finnish-cased-v1') { - steps { - sh 'cd examples/nlp/token_classification && \ - python token_classification_train.py \ - model.dataset.data_dir=/home/TestData/nlp/token_classification_punctuation/ \ - trainer.devices=[0] \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name="TurkuNLP/bert-base-finnish-cased-v1" \ - exp_manager.exp_dir=null' - } - } - stage('Evaluation script for Token Classification') { - steps { - sh 'python examples/nlp/token_classification/token_classification_evaluate.py \ - model.dataset.data_dir=/home/TestData/nlp/ner/ \ - model.dataset.use_cache=false \ - pretrained_model=/home/TestData/nlp/pretrained_models/NER_Model_with_BERT_base_uncased.nemo' - } - } - stage('Evaluation script for Punctuation') { - steps { - sh 'data_dir="$(mktemp -d -p "$(pwd)")" && \ - cp /home/TestData/nlp/token_classification_punctuation/*.txt "${data_dir}"/ && \ - python examples/nlp/token_classification/punctuation_capitalization_train_evaluate.py \ - +do_training=false \ - +do_testing=true \ - model.test_ds.ds_item="${data_dir}" \ - ~model.train_ds \ - ~model.validation_ds \ - +model.test_ds.use_cache=false \ - pretrained_model=/home/TestData/nlp/pretrained_models/Punctuation_Capitalization_with_DistilBERT_base_uncased.nemo && \ - rm -rf "${data_dir}"' - } - } - stage('L2: Punctuation & Capitalization, 2GPUs with DistilBERT, Fine-tuning on different data') { - steps { - sh 'cd examples/nlp/token_classification && \ - output_dir="$(mktemp -d -p "$(pwd)")" && \ - tmp_data_dir="$(mktemp -d -p "$(pwd)")" && \ - cp /home/TestData/nlp/token_classification_punctuation/*.txt "${tmp_data_dir}"/ && \ - python punctuation_capitalization_train_evaluate.py \ - model.train_ds.use_tarred_dataset=false \ - model.train_ds.ds_item="${tmp_data_dir}" \ - model.validation_ds.ds_item="${tmp_data_dir}" \ - model.test_ds.ds_item="${tmp_data_dir}" \ - model.language_model.pretrained_model_name=distilbert-base-uncased \ - +model.train_ds.use_cache=false \ - 
+model.validation_ds.use_cache=false \ - +model.test_ds.use_cache=false \ - trainer.devices=[0,1] \ - trainer.accelerator="gpu" \ - trainer.strategy=ddp \ - trainer.max_epochs=1 \ - +exp_manager.explicit_log_dir="${output_dir}" \ - +do_testing=true && \ - tmp_data_dir_2="$(mktemp -d -p "$(pwd)")" && \ - mv "${tmp_data_dir}"/* "${tmp_data_dir_2}" && \ - rm -rf "${tmp_data_dir}" && \ - python punctuation_capitalization_train_evaluate.py \ - model.train_ds.use_tarred_dataset=false \ - model.train_ds.ds_item="${tmp_data_dir_2}" \ - model.validation_ds.ds_item="${tmp_data_dir_2}" \ - model.test_ds.ds_item="${tmp_data_dir_2}" \ - pretrained_model="${output_dir}/checkpoints/Punctuation_and_Capitalization.nemo" \ - +model.train_ds.use_cache=false \ - +model.validation_ds.use_cache=false \ - +model.test_ds.use_cache=false \ - trainer.devices=[0,1] \ - trainer.accelerator="gpu" \ - trainer.strategy=ddp \ - trainer.max_epochs=1 \ - exp_manager=null && \ - rm -rf /workspace/NeMo/examples/nlp/token_classification/nemo_experiments \ - "${tmp_data_dir_2}" \ - "${output_dir}"' - } - } - } - } - - stage('Punctuation & Capitalization tarred dataset') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - stages { - stage('create and use tarred dataset') { - steps { - sh 'data_dir="$(mktemp -d -p "$(pwd)")" && \ - cp -r /home/TestData/nlp/token_classification_punctuation/*.txt \ - /home/TestData/nlp/token_classification_punctuation/wmt_wiki_10000 \ - "${data_dir}"/ && \ - usual_data=${data_dir}/wmt_wiki_10000 && \ - output_dir="$(mktemp -d -p "$(pwd)")" && \ - tarred_data=${output_dir}/train_tarred && \ - tokens_in_batch=2000 && \ - max_seq_length=512 && \ - lm_model=distilbert-base-uncased && \ - python examples/nlp/token_classification/data/create_punctuation_capitalization_tarred_dataset.py \ - --text ${usual_data}/input.txt \ - --labels ${usual_data}/labels.txt \ - --output_dir ${tarred_data} \ - --tokens_in_batch ${tokens_in_batch} \ - --max_seq_length 512 \ - --lines_per_dataset_fragment 2000 \ - --num_batches_per_tarfile 5 \ - --tar_file_prefix punctuation_capitalization \ - --tokenizer_name ${lm_model} \ - --use_fast_tokenizer \ - --pad_label O \ - --n_jobs 3 && \ - echo "Number of tarred files in dataset:" && \ - ls ${tarred_data}/*.tar | wc -l && \ - echo "Label id files in dataset:" && \ - ls ${tarred_data}/*.csv && \ - metadata_file=${tarred_data}/metadata.punctuation_capitalization.tokens${tokens_in_batch}.max_seq_length${max_seq_length}.${lm_model}.json && \ - python examples/nlp/token_classification/punctuation_capitalization_train_evaluate.py \ - model.validation_ds.ds_item="${data_dir}" \ - model.test_ds.ds_item="${data_dir}" \ - model.train_ds.ds_item=${tarred_data} \ - model.language_model.pretrained_model_name=${lm_model} \ - model.train_ds.use_tarred_dataset=true \ - model.train_ds.tar_metadata_file=${metadata_file} \ - +model.train_ds.use_cache=false \ - +model.validation_ds.use_cache=false \ - +model.test_ds.use_cache=false \ - trainer.devices=[0,1] \ - trainer.accelerator="gpu" \ - trainer.strategy=ddp \ - trainer.max_epochs=1 \ - +exp_manager.explicit_log_dir=${output_dir}/output && \ - rm -rf "${output_dir}" "${data_dir}"' - } - } - } - } - - stage('Punctuation & Capitalization, Different ways of passing labels to model') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - stages { - stage('Punctuation & Capitalization, Using model.common_datasest_parameters.label_vocab_dir') { - steps { - sh 'cd 
examples/nlp/token_classification && \ - work_dir="$(mktemp -d -p "$(pwd)")" && \ - label_vocab_dir="${work_dir}/labels" && \ - mkdir -p ${label_vocab_dir} && \ - data_dir="${work_dir}/data" && \ - mkdir -p "${data_dir}" && \ - cp /home/TestData/nlp/token_classification_punctuation/*.txt "${data_dir}" && \ - output_dir="${work_dir}/output" && \ - mkdir -p "${output_dir}" && \ - punct_label_vocab="${label_vocab_dir}/punct_label_vocab.csv" && \ - capit_label_vocab="${label_vocab_dir}/capit_label_vocab.csv" && \ - printf "O\n,\n.\n?\n" > "${punct_label_vocab}" && \ - printf "O\nU\n" > "${capit_label_vocab}" && \ - python punctuation_capitalization_train_evaluate.py \ - model.train_ds.use_tarred_dataset=false \ - model.train_ds.ds_item="${data_dir}" \ - model.validation_ds.ds_item="${data_dir}" \ - model.test_ds.ds_item="${data_dir}" \ - model.language_model.pretrained_model_name=distilbert-base-uncased \ - model.common_dataset_parameters.label_vocab_dir="${label_vocab_dir}" \ - model.class_labels.punct_labels_file="$(basename "${punct_label_vocab}")" \ - model.class_labels.capit_labels_file="$(basename "${capit_label_vocab}")" \ - +model.train_ds.use_cache=false \ - +model.validation_ds.use_cache=false \ - +model.test_ds.use_cache=false \ - trainer.devices=[0,1] \ - trainer.strategy=ddp \ - trainer.max_epochs=1 \ - +exp_manager.explicit_log_dir="${output_dir}" \ - +do_testing=false && \ - python punctuation_capitalization_train_evaluate.py \ - +do_training=false \ - +do_testing=true \ - ~model.train_ds \ - ~model.validation_ds \ - model.test_ds.ds_item="${data_dir}" \ - pretrained_model="${output_dir}/checkpoints/Punctuation_and_Capitalization.nemo" \ - +model.train_ds.use_cache=false \ - +model.validation_ds.use_cache=false \ - +model.test_ds.use_cache=false \ - trainer.devices=[0,1] \ - trainer.strategy=ddp \ - trainer.max_epochs=1 \ - exp_manager=null && \ - rm -rf "${work_dir}"' - } - } - stage('Punctuation & Capitalization, Using model.common_datasest_parameters.{punct,capit}_label_ids') { - steps { - sh 'cd examples/nlp/token_classification && \ - work_dir="$(mktemp -d -p "$(pwd)")" && \ - output_dir="${work_dir}/output" && \ - mkdir -p "${output_dir}" && \ - data_dir="${work_dir}/data" && \ - mkdir -p "${data_dir}" && \ - cp /home/TestData/nlp/token_classification_punctuation/*.txt "${data_dir}" && \ - conf_name=punctuation_capitalization_config_with_ids && \ - cp conf/punctuation_capitalization_config.yaml "${work_dir}/${conf_name}.yaml" && \ - sed -i $\'s/punct_label_ids: null/punct_label_ids: {O: 0, \\\',\\\': 1, .: 2, \\\'?\\\': 3}/\' \ - "${work_dir}/${conf_name}.yaml" && \ - sed -i $\'s/capit_label_ids: null/capit_label_ids: {O: 0, U: 1}/\' \ - "${work_dir}/${conf_name}.yaml" && \ - python punctuation_capitalization_train_evaluate.py \ - --config-path "${work_dir}" \ - --config-name "${conf_name}" \ - model.train_ds.use_tarred_dataset=false \ - model.train_ds.ds_item="${data_dir}" \ - model.validation_ds.ds_item="${data_dir}" \ - model.test_ds.ds_item="${data_dir}" \ - model.language_model.pretrained_model_name=distilbert-base-uncased \ - +model.train_ds.use_cache=false \ - +model.validation_ds.use_cache=false \ - +model.test_ds.use_cache=false \ - trainer.devices=[0,1] \ - trainer.strategy=ddp \ - trainer.max_epochs=1 \ - +exp_manager.explicit_log_dir="${output_dir}" \ - +do_testing=false && \ - python punctuation_capitalization_train_evaluate.py \ - +do_training=false \ - +do_testing=true \ - ~model.train_ds \ - ~model.validation_ds \ - model.test_ds.ds_item="${data_dir}" \ - 
pretrained_model="${output_dir}/checkpoints/Punctuation_and_Capitalization.nemo" \ - +model.train_ds.use_cache=false \ - +model.validation_ds.use_cache=false \ - +model.test_ds.use_cache=false \ - trainer.devices=[0,1] \ - trainer.strategy=ddp \ - trainer.max_epochs=1 \ - exp_manager=null && \ - rm -rf "${work_dir}"' - } - } - } - } - stage('Punctuation & Capitalization inference') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - stages { - stage('Restore punctuation and capitalization in long text') { - steps { - sh 'output_dir="$(mktemp -d -p "$(pwd)")" && \ - python examples/nlp/token_classification/punctuate_capitalize_infer.py \ - --input_manifest /home/TestData/nlp/token_classification_punctuation/iwslt_tst2019.manifest \ - --output_text "${output_dir}/iwslt_inference_result.txt" \ - --max_seq_length 92 \ - --step 8 \ - --margin 16 \ - --pretrained_name punctuation_en_bert \ - --batch_size 32 && \ - rm -rf "${output_dir}"' - } - } - } - } - - stage('L2: Parallel Pretraining BERT pretraining from Text/Preprocessed') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage('L2: Pretraining BERT pretraining from Text') { - steps { - sh 'cd examples/nlp/language_modeling && \ - python bert_pretraining.py \ - --config-name=bert_pretraining_from_text_config.yaml \ - trainer.devices=[0] \ - trainer.accelerator="gpu" \ - trainer.precision=16 \ - +trainer.fast_dev_run=true \ - model.train_ds.data_file=/home/TestData/nlp/wikitext-2/train.txt \ - model.train_ds.batch_size=32 \ - model.validation_ds.data_file=/home/TestData/nlp/wikitext-2/valid.txt \ - model.validation_ds.batch_size=32 \ - model.language_model.config_file=/home/TestData/nlp/bert_configs/bert_3200.json \ - model.optim.lr=0.01 \ - model.optim.sched.warmup_ratio=0.1 \ - model.tokenizer.tokenizer_name=sentencepiece \ - model.tokenizer.tokenizer_model=/home/TestData/nlp/wikitext-2/tokenizer_bpe_v3193/tokenizer.model \ - model.mask_prob=0.15 \ - model.short_seq_prob=0.1 \ - exp_manager.exp_dir=PretrainingBERTFromText \ - ' - sh 'rm -f /home/TestData/nlp/wikitext-2/*.pkl' - sh 'rm -rf examples/nlp/language_modeling/PretrainingBERTFromText' - sh 'ls -lha examples/nlp/language_modeling' - } - } - stage('L2: Pretraining BERT from Preprocessed') { - steps { - sh 'cd examples/nlp/language_modeling && \ - python bert_pretraining.py \ - --config-name=bert_pretraining_from_preprocessed_config.yaml \ - trainer.devices=[1] \ - trainer.accelerator="gpu" \ - trainer.precision=16 \ - +trainer.fast_dev_run=false \ - +trainer.max_epochs=1 \ - +trainer.limit_val_batches=0 \ - +trainer.limit_train_batches=1 \ - model.train_ds.data_file=/home/TestData/nlp/wiki_book_mini/training \ - model.train_ds.batch_size=8 \ - model.language_model.lm_checkpoint=/home/TestData/nlp/bert_ckpts/nemo1.0/bert_base_uncased_mlm_final_1074591_nemo1.0.pt \ - model.language_model.config_file=/home/TestData/nlp/bert_configs/uncased_L-12_H-768_A-12.json \ - model.optim.lr=0.875e-4 \ - model.optim.weight_decay=0.01 \ - model.optim.sched.warmup_ratio=0.01 \ - exp_manager.exp_dir=PretrainingBERTFromPreprocessed \ - exp_manager.create_checkpoint_callback=False \ - ' - sh 'rm -rf examples/nlp/language_modeling/PretrainingBERTFromPreprocessed' - sh 'ls -lha examples/nlp/language_modeling' - } - } - } - } - - stage('L2: Entity Linking') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage ('Self Alignment Pretraining BERT') { - 
steps { - sh 'cd examples/nlp/entity_linking && \ - python self_alignment_pretraining.py \ - project_dir=. \ - trainer.val_check_interval=3 \ - model.raw_data=None \ - model.train_ds.data_file=/home/TestData/nlp/entity_linking/tiny_example_train_pairs.tsv \ - model.validation_ds.data_file=/home/TestData/nlp/entity_linking/tiny_example_validation_pairs.tsv \ - model.train_ds.batch_size=8 \ - model.validation_ds.batch_size=8 \ - exp_manager.exp_dir=null' - } - } - } - } - - // TODO: remove +model.optim.capturable=True when Pytorch fix: https://github.com/pytorch/pytorch/pull/81858 - // is in the release container - stage('L2: NMT Attention is All You Need Training') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage('L2: NMT Training Post-LN') { - steps { - sh 'python examples/nlp/machine_translation/enc_dec_nmt.py \ - --config-path=conf \ - --config-name=aayn_base \ - do_testing=false \ - model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ - model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ - model.encoder.num_layers=1 \ - model.encoder.hidden_size=64 \ - model.encoder.inner_size=256 \ - model.decoder.num_layers=1 \ - model.decoder.hidden_size=64 \ - model.decoder.inner_size=256 \ - +model.optim.capturable=True \ - trainer.devices=[0] \ - trainer.accelerator="gpu" \ - +trainer.val_check_interval=2 \ - +trainer.limit_val_batches=1 \ - +trainer.max_steps=2 \ - trainer.precision=16 \ - +exp_manager.explicit_log_dir=examples/nlp/machine_translation/nmt_results \ - +exp_manager.create_checkpoint_callback=true \ - ' - sh 'python examples/nlp/machine_translation/enc_dec_nmt.py \ - --config-path=conf \ - --config-name=aayn_base \ - do_testing=true \ - model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ - model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ - model.encoder.num_layers=1 \ - model.encoder.hidden_size=64 \ - model.encoder.inner_size=256 \ - model.decoder.num_layers=1 \ - model.decoder.hidden_size=64 \ - model.decoder.inner_size=256 \ - +model.optim.capturable=True \ - trainer.devices=[0] \ - trainer.accelerator="gpu" \ - +trainer.val_check_interval=10 \ - +trainer.limit_val_batches=1 \ - +trainer.limit_test_batches=1 \ - +trainer.max_steps=10 \ - +exp_manager.explicit_log_dir=examples/nlp/machine_translation/nmt_results \ - +exp_manager.create_checkpoint_callback=true \ - 
+exp_manager.resume_if_exists=True \ - ' - sh 'rm -rf examples/nlp/machine_translation/nmt_results' - } - } - - stage('L2: NMT Training Pre-LN') { - steps { - sh 'cd examples/nlp/machine_translation && \ - python enc_dec_nmt.py \ - --config-path=conf \ - --config-name=aayn_base \ - do_testing=true \ - model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ - model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ - model.encoder.pre_ln=true \ - model.decoder.pre_ln=true \ - trainer.devices=[1] \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - +trainer.limit_test_batches=2 \ - exp_manager=null \ - ' - } - } - stage('L2: NMT Multi-Validation') { - steps { - sh 'cd examples/nlp/machine_translation && \ - python enc_dec_nmt.py \ - --config-path=conf \ - --config-name=aayn_base \ - do_testing=true \ - model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-en-de.src \ - model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-en-de.ref \ - model.validation_ds.src_file_name=[/home/TestData/nlp/nmt/toy_data/wmt13-en-de.src,/home/TestData/nlp/nmt/toy_data/wmt14-en-de.src] \ - model.validation_ds.tgt_file_name=[/home/TestData/nlp/nmt/toy_data/wmt13-en-de.ref,/home/TestData/nlp/nmt/toy_data/wmt14-en-de.ref] \ - model.test_ds.src_file_name=[/home/TestData/nlp/nmt/toy_data/wmt13-en-de.src,/home/TestData/nlp/nmt/toy_data/wmt14-en-de.src] \ - model.test_ds.tgt_file_name=[/home/TestData/nlp/nmt/toy_data/wmt13-en-de.ref,/home/TestData/nlp/nmt/toy_data/wmt14-en-de.ref] \ - model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ - model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ - trainer.devices=[0] \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - +trainer.limit_test_batches=2 \ - exp_manager=null \ - ' - } - } - } - } - - stage('L2: NMT Attention is All You Need Inference') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh 'cd examples/nlp/machine_translation && \ - python nmt_transformer_infer.py \ - --model=/home/TestData/nlp/nmt/toy_data/enes_v16k_s100k_6x6.nemo \ - --srctext=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.test.src \ - --tgtout=/home/TestData/nlp/nmt/toy_data/out.txt \ - --target_lang en \ - --source_lang de \ - ' - } - } - - stage('L2: NMT Attention is All You Need Finetuning') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "cd examples/nlp/machine_translation && \ - python enc_dec_nmt_finetune.py \ - model_path=/home/TestData/nlp/nmt/toy_data/enes_v16k_s100k_6x6.nemo \ - trainer.devices=[0] \ - ~trainer.max_epochs \ - model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - 
model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - +trainer.val_check_interval=10 \ - +trainer.limit_val_batches=1 \ - +trainer.limit_test_batches=1 \ - +trainer.max_steps=10 \ - +exp_manager.exp_dir=examples/nlp/machine_translation/nmt_finetune \ - +exp_manager.create_checkpoint_callback=True \ - +exp_manager.checkpoint_callback_params.monitor=val_sacreBLEU \ - +exp_manager.checkpoint_callback_params.mode=max \ - +exp_manager.checkpoint_callback_params.save_best_model=true \ - " - sh "rm -rf examples/nlp/machine_translation/nmt_finetune" - } - } - - - stage('L2: NMT Tarred Dataset Creation') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage('L2: NMT Auto Tarred Dataset Creation') { - steps { - sh 'cd examples/nlp/machine_translation && \ - python enc_dec_nmt.py \ - --config-path=conf \ - --config-name=aayn_base \ - do_training=false \ - model.preproc_out_dir=$PWD/preproc_out_dir \ - model.train_ds.use_tarred_dataset=true \ - model.train_ds.n_preproc_jobs=2 \ - model.train_ds.lines_per_dataset_fragment=500 \ - model.train_ds.num_batches_per_tarfile=10 \ - model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.encoder_tokenizer.vocab_size=2000 \ - model.decoder_tokenizer.vocab_size=2000 \ - ~model.test_ds \ - trainer.devices=[0] \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - exp_manager=null \ - ' - } - } - - stage('L2: NMT Script Tarred Dataset Creation') { - steps { - sh 'cd examples/nlp/machine_translation && \ - python create_tarred_parallel_dataset.py \ - --src_fname /home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - --tgt_fname /home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - --out_dir $PWD/out_dir \ - --encoder_tokenizer_vocab_size=2000 \ - --decoder_tokenizer_vocab_size=2000 \ - --tokens_in_batch=1000 \ - --lines_per_dataset_fragment=500 \ - --num_batches_per_tarfile=10 \ - --n_preproc_jobs=2 \ - ' - } - } - } - } - // stage('L2: Megatron NMT Training TP=2') { - // when { - // anyOf { - // branch 'main' - // changeRequest target: 'main' - // } - // } - // failFast true - // steps { - // sh "python examples/nlp/machine_translation/megatron_nmt_training.py \ - // trainer.devices=2 \ - // trainer.accelerator=gpu \ - // trainer.log_every_n_steps=1 \ - // trainer.val_check_interval=10 \ - // +trainer.limit_val_batches=2 \ - // trainer.accumulate_grad_batches=1 \ - // trainer.max_steps=10 \ - // trainer.precision=16 \ - // trainer.gradient_clip_val=1.0 \ - // exp_manager.exp_dir=examples/nlp/machine_translation/megatron_nmt_results \ - // model.tensor_model_parallel_size=2 \ - // model.seq_length=128 \ - // model.encoder.num_layers=4 \ - // model.encoder.hidden_size=64 \ - // model.encoder.num_attention_heads=8 \ - // model.encoder.activation='swiglu' \ - // model.encoder.masked_softmax_fusion=False \ - // model.encoder.bias_activation_fusion=False \ - // model.encoder.activations_checkpoint_method='block' \ - // model.encoder.activations_checkpoint_num_layers=1 \ - // model.decoder.num_layers=2 \ - // model.decoder.hidden_size=64 \ - 
// model.decoder.num_attention_heads=8 \ - // model.decoder.activation='swiglu' \ - // model.decoder.masked_softmax_fusion=False \ - // model.decoder.bias_activation_fusion=False \ - // model.decoder.activations_checkpoint_method='block' \ - // model.decoder.activations_checkpoint_num_layers=1 \ - // model.micro_batch_size=2 \ - // model.global_batch_size=4 \ - // model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - // model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - // model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - // model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - // model.train_ds.num_workers=1 \ - // model.validation_ds.num_workers=1 \ - // ~model.test_ds \ - // model.train_ds.dataset_type=text_memmap \ - // model.encoder_tokenizer.library=sentencepiece \ - // model.encoder_tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model \ - // model.decoder_tokenizer.library=sentencepiece \ - // model.decoder_tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model" - // // Change val_check_interval to 1 for resume as the len(dataloder) is 1 due to max_steps being the same as that of training and Lightning 2.0 raises an error - // // if val_check_interval > len(dataloder: https://github.com/Lightning-AI/lightning/blob/2.0.6/src/lightning/pytorch/loops/fit_loop.py#L259 at the beginning of fit_loop.run() - // sh "python examples/nlp/machine_translation/megatron_nmt_training.py \ - // trainer.devices=2 \ - // trainer.accelerator=gpu \ - // trainer.log_every_n_steps=1 \ - // trainer.val_check_interval=1 \ - // +trainer.limit_val_batches=2 \ - // trainer.accumulate_grad_batches=1 \ - // trainer.max_steps=10 \ - // trainer.precision=16 \ - // trainer.gradient_clip_val=1.0 \ - // exp_manager.exp_dir=examples/nlp/machine_translation/megatron_nmt_results \ - // model.tensor_model_parallel_size=2 \ - // model.seq_length=128 \ - // model.encoder.num_layers=4 \ - // model.encoder.hidden_size=64 \ - // model.encoder.num_attention_heads=8 \ - // model.encoder.activation='swiglu' \ - // model.encoder.masked_softmax_fusion=False \ - // model.encoder.bias_activation_fusion=False \ - // model.encoder.activations_checkpoint_method='block' \ - // model.encoder.activations_checkpoint_num_layers=1 \ - // model.decoder.num_layers=2 \ - // model.decoder.hidden_size=64 \ - // model.decoder.num_attention_heads=8 \ - // model.decoder.activation='swiglu' \ - // model.decoder.masked_softmax_fusion=False \ - // model.decoder.bias_activation_fusion=False \ - // model.decoder.activations_checkpoint_method='block' \ - // model.decoder.activations_checkpoint_num_layers=1 \ - // model.micro_batch_size=2 \ - // model.global_batch_size=4 \ - // model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - // model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - // model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - // model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - // model.train_ds.num_workers=1 \ - // model.validation_ds.num_workers=1 \ - // ~model.test_ds \ - // model.train_ds.dataset_type=text_memmap \ - // model.encoder_tokenizer.library=sentencepiece \ - // model.encoder_tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model \ - // model.decoder_tokenizer.library=sentencepiece \ - // 
model.decoder_tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model" - // sh "rm -rf examples/nlp/machine_translation/megatron_nmt_results" - // } - // } - stage('L2: Megatron BART Perceiver MIM Training TP=2') { - // Testing Megatron hidden transformations - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "python examples/nlp/language_modeling/megatron_bart_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/megatron_mim_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.arch=perceiver \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation='swiglu' \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation='swiglu' \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.micro_batch_size=2 \ - model.global_batch_size=4 \ - model.data.data_impl=text_mmap \ - model.data.data_prefix=[1.0,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src] \ - model.data.splits_string=\'\"800,100,100\"\' \ - model.data.whole_word_masking=False \ - model.tokenizer.library=sentencepiece \ - model.tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model \ - ++model.hiddens.enc_output_name=z \ - ++model.hiddens.transform.q_z_given_x.cls_name=cond_gaussian \ - ++model.hiddens.transform.q_z_given_x.hidden_size=64 \ - ++model.hiddens.loss.mim.cls_name=a_mim \ - ++model.hiddens.loss.mim.loss_weight=0.5" - // Change val_check_interval to 1 for resume as the len(dataloder) is 1 due to max_steps being the same as that of training and Lightning 2.0 raises an error - // if val_check_interval > len(dataloder: https://github.com/Lightning-AI/lightning/blob/2.0.6/src/lightning/pytorch/loops/fit_loop.py#L259 at the beginning of fit_loop.run() - sh "python examples/nlp/language_modeling/megatron_bart_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/megatron_mim_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.arch=perceiver \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation='swiglu' \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - 
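The MIM stage here lowers trainer.val_check_interval to 1 for the resume run because, as the inline comment notes, Lightning 2.x refuses a val_check_interval larger than the number of training batches. A minimal sketch of that constraint, assuming a plain PyTorch DataLoader; the helper below is illustrative and not part of NeMo:

```python
# Sketch of the guard implied by the comment above: Lightning 2.x rejects a
# val_check_interval larger than len(train_dataloader), which is why the resume
# run drops it to 1. Illustrative helper only, not NeMo code.
import torch
from torch.utils.data import DataLoader, TensorDataset

def clamped_val_check_interval(requested: int, train_loader: DataLoader) -> int:
    # On resume the dataloader can be effectively one batch long (max_steps nearly
    # reached), so the requested interval is capped at the dataloader length.
    return max(1, min(requested, len(train_loader)))

tiny_loader = DataLoader(TensorDataset(torch.randn(8, 4)), batch_size=8)  # one batch total
print(clamped_val_check_interval(10, tiny_loader))  # -> 1, matching trainer.val_check_interval=1
```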
model.decoder.activation='swiglu' \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.micro_batch_size=2 \ - model.global_batch_size=4 \ - model.data.data_impl=text_mmap \ - model.data.data_prefix=[1.0,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src] \ - model.data.splits_string=\'\"800,100,100\"\' \ - model.data.whole_word_masking=False \ - model.tokenizer.library=sentencepiece \ - model.tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model \ - ++model.hiddens.enc_output_name=z \ - ++model.hiddens.transform.q_z_given_x.cls_name=cond_gaussian \ - ++model.hiddens.transform.q_z_given_x.hidden_size=64 \ - ++model.hiddens.loss.mim.cls_name=a_mim \ - ++model.hiddens.loss.mim.loss_weight=0.5" - sh "rm -rf examples/nlp/language_modeling/megatron_mim_results" - } - } - // stage('L2: NMT Bottleneck Fallback') { - // when { - // anyOf { - // branch 'main' - // changeRequest target: 'main' - // } - // } - // failFast true - // parallel { - // stage('L2: seq2seq (no bottleneck)') { - // steps { - // sh 'cd examples/nlp/machine_translation && \ - // enc_dec_nmt-bottleneck.py \ - // --config-path=conf \ - // --config-name=aayn_bottleneck \ - // do_testing=true \ - // model.model_type=nll \ - // model.encoder.arch=seq2seq \ - // model.encoder.hidden_steps=1 \ - // model.encoder.hidden_blocks=1 \ - // model.encoder.hidden_init_method=params \ - // model.encoder.hidden_size=64 \ - // model.encoder.inner_size=128 \ - // model.encoder.num_attention_heads=2 \ - // model.encoder.num_layers=2 \ - // model.decoder.hidden_size=64 \ - // model.decoder.inner_size=128 \ - // model.decoder.num_attention_heads=2 \ - // model.decoder.num_layers=2 \ - // model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-en-de.src \ - // model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-en-de.ref \ - // model.validation_ds.src_file_name=[/home/TestData/nlp/nmt/toy_data/wmt13-en-de.src,/home/TestData/nlp/nmt/toy_data/wmt14-en-de.src] \ - // model.validation_ds.tgt_file_name=[/home/TestData/nlp/nmt/toy_data/wmt13-en-de.ref,/home/TestData/nlp/nmt/toy_data/wmt14-en-de.ref] \ - // model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt13-en-de.src \ - // model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt13-en-de.ref \ - // model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ - // model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ - // trainer.devices=[1] \ - // trainer.accelerator="gpu" \ - // +trainer.fast_dev_run=true \ - // +trainer.limit_test_batches=2 \ - // exp_manager=null \ - // ' - // } - // } - // } - // } - // stage('L2: NMT Bottleneck Architecture') { - // when { - // anyOf { - // branch 'main' - // changeRequest target: 'main' - // } - // } - // failFast true - // parallel { - // stage('Bridge Encoder (identity)') { - // steps { - // sh 'cd examples/nlp/machine_translation && \ - // enc_dec_nmt-bottleneck.py \ - // --config-path=conf \ - // --config-name=aayn_bottleneck \ - // do_testing=true \ - // model.model_type=nll \ - // model.encoder.arch=bridge \ - // model.encoder.hidden_steps=1 \ - // model.encoder.hidden_blocks=1 \ - // model.encoder.hidden_init_method=identity \ - // model.encoder.hidden_size=64 \ - // model.encoder.inner_size=128 \ - // model.encoder.num_attention_heads=2 \ - // 
model.encoder.num_layers=2 \ - // model.decoder.hidden_size=64 \ - // model.decoder.inner_size=128 \ - // model.decoder.num_attention_heads=2 \ - // model.decoder.num_layers=2 \ - // model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - // model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - // model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - // model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - // model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - // model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - // model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ - // model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ - // trainer.devices=[0] \ - // trainer.accelerator="gpu" \ - // +trainer.fast_dev_run=true \ - // +trainer.limit_test_batches=2 \ - // exp_manager=null \ - // ' - // } - // } - // stage('Perceiver Encoder (params)') { - // steps { - // sh 'cd examples/nlp/machine_translation && \ - // enc_dec_nmt-bottleneck.py \ - // --config-path=conf \ - // --config-name=aayn_bottleneck \ - // do_testing=true \ - // model.model_type=nll \ - // model.encoder.arch=perceiver \ - // model.encoder.hidden_steps=1 \ - // model.encoder.hidden_blocks=1 \ - // model.encoder.hidden_init_method=params \ - // model.encoder.hidden_size=64 \ - // model.encoder.inner_size=128 \ - // model.encoder.num_attention_heads=2 \ - // model.encoder.num_layers=2 \ - // model.decoder.hidden_size=64 \ - // model.decoder.inner_size=128 \ - // model.decoder.num_attention_heads=2 \ - // model.decoder.num_layers=2 \ - // model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - // model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - // model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - // model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - // model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - // model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - // model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ - // model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ - // trainer.devices=[1] \ - // trainer.accelerator="gpu" \ - // +trainer.fast_dev_run=true \ - // +trainer.limit_test_batches=2 \ - // exp_manager=null \ - // ' - // } - // } - // } - // } - // stage('L2: NMT Bottleneck LVM') { - // when { - // anyOf { - // branch 'main' - // changeRequest target: 'main' - // } - // } - // failFast true - // parallel { - // stage('VAE') { - // steps { - // sh 'cd examples/nlp/machine_translation && \ - // enc_dec_nmt-bottleneck.py \ - // --config-path=conf \ - // --config-name=aayn_bottleneck \ - // do_testing=true \ - // model.model_type=vae \ - // model.encoder.arch=perceiver \ - // model.encoder.hidden_steps=1 \ - // model.encoder.hidden_blocks=1 \ - // model.encoder.hidden_init_method=params \ - // model.encoder.hidden_size=64 \ - // model.encoder.inner_size=128 \ - // model.encoder.num_attention_heads=2 \ - // model.encoder.num_layers=2 \ - // model.decoder.hidden_size=64 \ - // model.decoder.inner_size=128 \ - // model.decoder.num_attention_heads=2 \ - // model.decoder.num_layers=2 \ - // 
model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - // model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - // model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - // model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - // model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - // model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - // model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ - // model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ - // trainer.devices=[0] \ - // trainer.accelerator="gpu" \ - // +trainer.fast_dev_run=true \ - // +trainer.limit_test_batches=2 \ - // exp_manager=null \ - // ' - // } - // } - // stage('MIM') { - // steps { - // sh 'cd examples/nlp/machine_translation && \ - // enc_dec_nmt-bottleneck.py \ - // --config-path=conf \ - // --config-name=aayn_bottleneck \ - // do_testing=true \ - // model.model_type=mim \ - // model.encoder.arch=perceiver \ - // model.encoder.hidden_steps=1 \ - // model.encoder.hidden_blocks=1 \ - // model.encoder.hidden_init_method=params \ - // model.encoder.hidden_size=64 \ - // model.encoder.inner_size=128 \ - // model.encoder.num_attention_heads=2 \ - // model.encoder.num_layers=2 \ - // model.decoder.hidden_size=64 \ - // model.decoder.inner_size=128 \ - // model.decoder.num_attention_heads=2 \ - // model.decoder.num_layers=2 \ - // model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - // model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - // model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - // model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - // model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - // model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - // model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ - // model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ - // trainer.devices=[1] \ - // trainer.accelerator="gpu" \ - // +trainer.fast_dev_run=true \ - // +trainer.limit_test_batches=2 \ - // exp_manager=null \ - // ' - // } - // } - // } - // } - stage('L2: Megatron Bert Pretraining and Resume Training with Pipeline Paralleism') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - model.pipeline_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - 
model.activations_checkpoint_method='block' \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings" - sh "python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=20 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.pipeline_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method='block' \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings" - sh "rm -rf examples/nlp/language_modeling/bert_pretrain_results" - sh "rm -rf examples/nlp/language_modeling/bert_index_mappings" - } - } - stage('L2: Megatron Bert Pretraining and Resume Training') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.sequence_parallel=True \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method='block' \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings" - sh "python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - 
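These BERT stages all follow the same train-then-resume pattern: a first run stops at trainer.max_steps=10 while writing checkpoints, then a second run with exp_manager.resume_if_exists=True and trainer.max_steps=20 continues from the last checkpoint. A rough equivalent in plain PyTorch Lightning, assuming a toy model and a local checkpoint directory (both are stand-ins, not the Megatron BERT setup itself):

```python
# Illustrative train-then-resume pattern with plain PyTorch Lightning.
# TinyRegressor and the "bert_pretrain_results" directory are stand-ins.
import torch
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from torch.utils.data import DataLoader, TensorDataset

class TinyRegressor(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(4, 1)

    def training_step(self, batch, batch_idx):
        x, y = batch
        return torch.nn.functional.mse_loss(self.layer(x), y)

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=2e-4)

loader = DataLoader(TensorDataset(torch.randn(64, 4), torch.randn(64, 1)), batch_size=8)

def make_trainer(max_steps: int) -> pl.Trainer:
    # Checkpoints land in one shared directory, like exp_manager.exp_dir above.
    return pl.Trainer(
        max_steps=max_steps,
        logger=False,
        callbacks=[ModelCheckpoint(dirpath="bert_pretrain_results", save_last=True)],
    )

# First run: stop at step 10 (mirrors trainer.max_steps=10).
make_trainer(10).fit(TinyRegressor(), loader)
# Second run: continue from the last checkpoint up to step 20
# (the rough analogue of exp_manager.resume_if_exists=True, trainer.max_steps=20).
make_trainer(20).fit(TinyRegressor(), loader, ckpt_path="bert_pretrain_results/last.ckpt")
```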
trainer.max_steps=20 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method='block' \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings" - sh "rm -rf examples/nlp/language_modeling/bert_pretrain_results" - sh "rm -rf examples/nlp/language_modeling/bert_index_mappings" - } - } - stage('L2: Megatron Core Bert Pretraining and Resume Training') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=32 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - model.mcore_bert=True \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.sequence_parallel=True \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method='block' \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings" - sh "NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=20 \ - trainer.precision=32 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.mcore_bert=True \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - 
model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method='block' \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings" - sh "rm -rf examples/nlp/language_modeling/bert_pretrain_results" - sh "rm -rf examples/nlp/language_modeling/bert_index_mappings" - } - } - stage('L2: NeMo Bert Embedding Finetuning and Resume') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "NVTE_FLASH_ATTN=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 NVTE_FUSED_ATTN=0 python examples/nlp/information_retrieval/megatron_bert_embedding_finetuning.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.max_steps=12 \ - trainer.val_check_interval=4 \ - trainer.max_epochs=1 \ - +trainer.num_sanity_val_steps=0 \ - restore_from_path=/home/TestData/nlp/bert_embedding/bert_embedding_nemo_tiny.nemo \ - model.num_layers=2 \ - model.hidden_size=64 \ - model.ffn_hidden_size=256 \ - model.num_attention_heads=2 \ - model.megatron_legacy=False \ - model.mcore_bert=False \ - model.global_batch_size=2 \ - model.micro_batch_size=1 \ - model.optim.lr=0.0005 \ - model.encoder_seq_length=512 \ - model.tokenizer.library='huggingface' \ - model.tokenizer.type='intfloat/e5-large-unsupervised' \ - model.data.data_train=/home/TestData/nlp/bert_embedding/bert_embedding_toy_data.jsonl \ - model.data.hard_negatives_to_train=4 \ - exp_manager.explicit_log_dir=examples/nlp/information_retrieval/bert_embedding_results \ - exp_manager.create_wandb_logger=False \ - exp_manager.resume_if_exists=False" - sh "NVTE_FLASH_ATTN=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 NVTE_FUSED_ATTN=0 python examples/nlp/information_retrieval/megatron_bert_embedding_finetuning.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.max_steps=36 \ - trainer.val_check_interval=4 \ - trainer.max_epochs=1 \ - +trainer.num_sanity_val_steps=0 \ - restore_from_path=/home/TestData/nlp/bert_embedding/bert_embedding_nemo_tiny.nemo \ - model.num_layers=2 \ - model.hidden_size=64 \ - model.ffn_hidden_size=256 \ - model.num_attention_heads=2 \ - model.megatron_legacy=False \ - model.mcore_bert=False \ - model.global_batch_size=2 \ - model.micro_batch_size=1 \ - model.optim.lr=0.0005 \ - model.encoder_seq_length=512 \ - model.tokenizer.library='huggingface' \ - model.tokenizer.type='intfloat/e5-large-unsupervised' \ - model.data.data_train=/home/TestData/nlp/bert_embedding/bert_embedding_toy_data.jsonl \ - model.data.hard_negatives_to_train=4 \ - exp_manager.explicit_log_dir=examples/nlp/information_retrieval/bert_embedding_results \ - exp_manager.create_wandb_logger=False \ - exp_manager.resume_if_exists=True" - sh "rm -rf examples/nlp/information_retrieval/bert_embedding_results" - } - } - stage('L2: Megatron Core Bert Embedding Finetuning and Resume') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "NVTE_FLASH_ATTN=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 NVTE_FUSED_ATTN=0 python examples/nlp/information_retrieval/megatron_bert_embedding_finetuning.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.max_steps=12 \ - 
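The embedding finetuning stages pin model.global_batch_size=2 and model.micro_batch_size=1 on two GPUs; in Megatron-style configs these values are tied to the data-parallel size and gradient accumulation. A small illustrative helper (not NeMo code) showing the usual consistency relation:

```python
# Illustrative helper (not NeMo code): the usual Megatron-style relation a
# global_batch_size setting has to satisfy.
def expected_global_batch_size(micro_batch_size: int,
                               world_size: int,
                               tensor_parallel: int = 1,
                               pipeline_parallel: int = 1,
                               grad_accum: int = 1) -> int:
    data_parallel = world_size // (tensor_parallel * pipeline_parallel)
    return micro_batch_size * data_parallel * grad_accum

# Matches the embedding finetuning stages above: 2 GPUs, micro_batch_size=1,
# no TP/PP overrides, no accumulation -> global_batch_size=2.
assert expected_global_batch_size(micro_batch_size=1, world_size=2) == 2
```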
trainer.val_check_interval=4 \ - trainer.max_epochs=36 \ - +trainer.num_sanity_val_steps=0 \ - restore_from_path=/home/TestData/nlp/bert_embedding/bert_embedding_mcore_tiny.nemo \ - model.num_layers=2 \ - model.hidden_size=64 \ - model.ffn_hidden_size=256 \ - model.num_attention_heads=2 \ - model.megatron_legacy=False \ - model.mcore_bert=True \ - model.global_batch_size=2 \ - model.micro_batch_size=1 \ - model.optim.lr=0.0005 \ - model.encoder_seq_length=512 \ - model.tokenizer.library='huggingface' \ - model.tokenizer.type='intfloat/e5-large-unsupervised' \ - model.data.data_train=/home/TestData/nlp/bert_embedding/bert_embedding_toy_data.jsonl \ - model.data.hard_negatives_to_train=4 \ - exp_manager.explicit_log_dir=examples/nlp/information_retrieval/bert_embedding_results \ - exp_manager.create_wandb_logger=False \ - exp_manager.resume_if_exists=False" - sh "NVTE_FLASH_ATTN=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 NVTE_FUSED_ATTN=0 python examples/nlp/information_retrieval/megatron_bert_embedding_finetuning.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.max_steps=16 \ - trainer.val_check_interval=4 \ - trainer.max_epochs=1 \ - +trainer.num_sanity_val_steps=0 \ - restore_from_path=/home/TestData/nlp/bert_embedding/bert_embedding_mcore_tiny.nemo \ - model.num_layers=2 \ - model.hidden_size=64 \ - model.ffn_hidden_size=256 \ - model.num_attention_heads=2 \ - model.megatron_legacy=False \ - model.mcore_bert=True \ - model.global_batch_size=2 \ - model.micro_batch_size=1 \ - model.optim.lr=0.0005 \ - model.encoder_seq_length=512 \ - model.tokenizer.library='huggingface' \ - model.tokenizer.type='intfloat/e5-large-unsupervised' \ - model.data.data_train=/home/TestData/nlp/bert_embedding/bert_embedding_toy_data.jsonl \ - model.data.hard_negatives_to_train=4 \ - exp_manager.explicit_log_dir=examples/nlp/information_retrieval/bert_embedding_results \ - exp_manager.create_wandb_logger=False \ - exp_manager.resume_if_exists=True" - sh "rm -rf examples/nlp/information_retrieval/bert_embedding_results" - } - } - stage('L2: Megatron RETRO Pretraining and Resume Training') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "python examples/nlp/language_modeling/megatron_retro_pretraining.py \ - trainer.num_nodes=1 \ - trainer.devices=2 \ - trainer.precision=bf16 \ - trainer.accelerator=gpu \ - model.data.data_prefix=['none'] \ - exp_manager.exp_dir=examples/nlp/language_modeling/mcore_retro_results \ - model.mcore_gpt=True \ - model.tensor_model_parallel_size=1 \ - model.pipeline_model_parallel_size=1 \ - model.optim.name=distributed_fused_adam \ - model.retro.retro_project_dir=/home/TestData/nlp/megatron_retro/mcore_retro/micro-wiki-core \ - model.data.num_workers=4 \ - model.micro_batch_size=1 \ - model.data.shuffle_documents=False \ - trainer.val_check_interval=30 \ - +trainer.num_sanity_val_steps=0 \ - model.init_method_std=0.023 \ - model.optim.lr=6.0e-4 \ - model.megatron_amp_O2=True \ - model.data.splits_string=\'\"98,2,0\"\' \ - model.data.dataloader_type=cyclic \ - trainer.max_steps=10" - sh "python examples/nlp/language_modeling/megatron_retro_pretraining.py \ - trainer.num_nodes=1 \ - trainer.devices=2 \ - trainer.precision=bf16 \ - trainer.accelerator=gpu \ - model.data.data_prefix=['none'] \ - exp_manager.exp_dir=examples/nlp/language_modeling/mcore_retro_results \ - model.mcore_gpt=True \ - model.tensor_model_parallel_size=1 \ - model.pipeline_model_parallel_size=1 \ - model.optim.name=distributed_fused_adam \ - 
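The RETRO stages pass model.data.splits_string="98,2,0", which carves a single corpus into train/validation/test spans by weight. A hedged sketch of that convention; the helper name and the exact rounding are illustrative, not NeMo's implementation:

```python
# Hedged sketch of the "98,2,0" splits_string convention: weights are normalised
# and turned into cumulative train/validation/test index ranges over one corpus.
def split_boundaries(splits_string: str, num_samples: int):
    weights = [float(w) for w in splits_string.split(",")]
    total = sum(weights)
    bounds, start = [], 0
    for w in weights:
        end = start + int(round(num_samples * w / total))
        bounds.append((start, min(end, num_samples)))
        start = end
    return bounds

print(split_boundaries("98,2,0", 1000))  # [(0, 980), (980, 1000), (1000, 1000)]
```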
model.retro.retro_project_dir=/home/TestData/nlp/megatron_retro/mcore_retro/micro-wiki-core \ - model.data.num_workers=4 \ - model.micro_batch_size=1 \ - model.data.shuffle_documents=False \ - trainer.val_check_interval=30 \ - +trainer.num_sanity_val_steps=0 \ - model.init_method_std=0.023 \ - model.optim.lr=6.0e-4 \ - model.megatron_amp_O2=True \ - model.data.splits_string=\'\"98,2,0\"\' \ - model.data.dataloader_type=cyclic \ - trainer.max_steps=20" - sh "rm -rf examples/nlp/language_modeling/mcore_retro_results" - } - } - stage('L2: (Legacy) Megatron RETRO Pretraining and Resume Training') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "python examples/nlp/language_modeling/megatron_retro_pretraining_legacy.py \ - trainer.devices=2 \ - trainer.num_nodes=1 \ - trainer.accelerator=gpu \ - trainer.accumulate_grad_batches=1 \ - trainer.limit_val_batches=2 \ - exp_manager.resume_if_exists=True \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - trainer.val_check_interval=10 \ - exp_manager.exp_dir=examples/nlp/language_modeling/retro_legacy_results \ - model.data.data_prefix='' \ - model.data.knn_index='' \ - model.data.retrieval_prefix='' \ - model.tensor_model_parallel_size=2 \ - model.micro_batch_size=4 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.chunk_size=32 \ - model.enc_num_layers=2 \ - model.dec_num_layers=2 \ - model.enc_cross_attention=[1] \ - model.dec_cross_attention=[1] \ - +model.data.mock=True" - sh "python examples/nlp/language_modeling/megatron_retro_pretraining_legacy.py \ - trainer.devices=2 \ - trainer.num_nodes=1 \ - trainer.accelerator=gpu \ - trainer.accumulate_grad_batches=1 \ - trainer.limit_val_batches=2 \ - exp_manager.resume_if_exists=True \ - trainer.max_steps=20 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - trainer.val_check_interval=10 \ - exp_manager.exp_dir=examples/nlp/language_modeling/retro_legacy_results \ - model.data.data_prefix='' \ - model.data.knn_index='' \ - model.data.retrieval_prefix='' \ - model.tensor_model_parallel_size=2 \ - model.micro_batch_size=4 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.chunk_size=32 \ - model.enc_num_layers=2 \ - model.dec_num_layers=2 \ - model.enc_cross_attention=[1] \ - model.dec_cross_attention=[1] \ - +model.data.mock=True" - sh "rm -rf examples/nlp/language_modeling/retro_legacy_results" - } - } - stage('L2: (Legacy) Megatron RETRO muTransfer Pretraining Performance') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "python examples/nlp/language_modeling/megatron_retro_mutransfer_pretrain.py \ - trainer.devices=2 \ - trainer.num_nodes=1 \ - trainer.accelerator=gpu \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=100 \ - trainer.log_every_n_steps=1 \ - trainer.precision=16 \ - trainer.val_check_interval=100 \ - trainer.limit_val_batches=0 \ - trainer.gradient_clip_val=1.0 \ - +trainer.num_sanity_val_steps=0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/retro_legacy_results/ \ - +exp_manager.version=smalltest \ - 
model.data.neighbors=2 \ - model.megatron_amp_O2=False \ - model.apply_query_key_layer_scaling=False \ - model.tensor_model_parallel_size=1 \ - model.optim.name=muadamw \ - model.optim.weight_decay=0.1 \ - model.optim.betas=[0.9,0.95] \ - model.optim.lr=6e-4 \ - model.optim.sched.warmup_steps=1000 \ - model.optim.sched.constant_steps=0 \ - model.optim.sched.min_lr=6e-5 \ - model.add_position_embedding=False \ - model.enc_num_layers=2 \ - model.dec_num_layers=6 \ - model.enc_cross_attention=[0] \ - model.dec_cross_attention=[3,5] \ - model.hidden_size=96 \ - model.ffn_hidden_size=384 \ - model.init_method_std=0.023 \ - model.num_attention_heads=12 \ - model.max_position_embeddings=1024 \ - model.encoder_seq_length=1024 \ - model.tokenizer.library=megatron \ - model.tokenizer.type=GPT2BPETokenizer \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_retro/gpt2-merges.txt \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_retro/gpt2-vocab.json \ - model.data.data_prefix=[/home/TestData/nlp/megatron_retro/retro_wiki_test_text_document] \ - model.data.knn_index=[/home/TestData/nlp/megatron_retro/knn2_map_wiki_test.idx] \ - model.data.retrieval_prefix=/home/TestData/nlp/megatron_retro/retro_wiki_test_text_document \ - model.data.index_mapping_dir=/home/TestData/nlp/megatron_retro \ - model.data.num_workers=8 \ - model.micro_batch_size=8 \ - model.normalization=rmsnorm \ - model.transformer_block_type=pre_ln \ - model.bias_activation_fusion=True \ - model.bias_dropout_add_fusion=False \ - model.masked_softmax_fusion=True \ - model.hidden_dropout=0 \ - model.attention_dropout=0 \ - model.fp32_residual_connection=True \ - model.shape_file=/home/TestData/nlp/megatron_retro/o1_rel_shape_info_tiny.yaml" - sh '''python -c "import pandas as pd -import pathlib -from pandas.testing import assert_frame_equal -from tensorboard.backend.event_processing.event_accumulator import EventAccumulator -import torch -if not (torch.cuda.is_available() and 'A100' in torch.cuda.get_device_name()): - import sys - sys.exit(0) -event_file = list(pathlib.Path('examples/nlp/language_modeling/retro_legacy_results/megatron_retro/smalltest').glob('events.out.tfevents*'))[0] -ea = EventAccumulator(str(event_file)).Reload() -vals = [] -for i in ea.Scalars('reduced_train_loss'): - vals.append(i.value) -training_curve = pd.DataFrame({'loss': vals}) -gt_curve = pd.read_csv('/home/TestData/nlp/megatron_retro/expected_learning_curve.csv') -assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' - sh "rm -rf examples/nlp/language_modeling/retro_legacy_results" - } - } - stage('L2: BioMegatron Bert NER Task') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "python examples/nlp/token_classification/token_classification_train.py \ - exp_manager.exp_dir=examples/nlp/language_modeling/token_classification_results \ - trainer.max_epochs=1 \ - model.dataset.data_dir=/home/TestData/nlp/ner \ - model.language_model.pretrained_model_name=biomegatron345m_biovocab_30k_cased \ - model.tokenizer.tokenizer_name=null" - sh "rm -rf examples/nlp/language_modeling/token_classification_results" - } - } - stage('L2: Megatron GPT Pretraining and Resume Training TETransformerLayerTP=2') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ 
- trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - ++model.name=megatron_gpt_full_te_layer_autocast \ - model.mcore_gpt=True \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=1 \ - model.optim.sched.constant_steps=1 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.normalization=layernorm1p \ - model.bias_activation_fusion=True \ - model.bias_dropout_add_fusion=True \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=null \ - model.activations_checkpoint_granularity=null \ - model.activations_checkpoint_num_layers=null \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" - sh "python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=6 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - exp_manager.resume_if_exists=True \ - ++model.name=megatron_gpt_full_te_layer_autocast \ - model.mcore_gpt=True \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.normalization=layernorm1p \ - model.bias_activation_fusion=True \ - model.bias_dropout_add_fusion=True \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=null \ - model.activations_checkpoint_granularity=null \ - model.activations_checkpoint_num_layers=null \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" - sh "rm -rf examples/nlp/language_modeling/gpt_pretrain_results" - sh "rm -rf examples/nlp/language_modeling/gpt_index_mappings" - } - } - // @chcui: model.cpu_offloading_num_layers=7 # temp workaround before m-lm !1124 is merged - // @athitten: Revert limit_val_batches to 2 until limit_val_batches 1.0 leading to no validation is fixed for non DictConfig data_prefix - stage('L2: Megatron GPT Pretraining and Resume Training TP=2') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh 
"python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=1 \ - model.optim.sched.constant_steps=1 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.normalization=rmsnorm \ - model.bias=False \ - model.bias_activation_fusion=False \ - model.bias_dropout_add_fusion=False \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.cpu_offloading_num_layers=7 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method='block' \ - model.activations_checkpoint_granularity='full' \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" - sh "python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=6 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.normalization=rmsnorm \ - model.bias=False \ - model.bias_activation_fusion=False \ - model.bias_dropout_add_fusion=False \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.cpu_offloading_num_layers=7 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method='block' \ - model.activations_checkpoint_granularity='full' \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" - sh "rm -rf examples/nlp/language_modeling/gpt_pretrain_results" - sh "rm -rf examples/nlp/language_modeling/gpt_index_mappings" - } - } - - stage('L2: Megatron GPT Pretraining and Resume Training TP=2 with Torch Distributed Checkpoint') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "python 
examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - model.mcore_gpt=True \ - model.torch_distributed_checkpoint=True \ - model.tensor_model_parallel_size=2 \ - model.optim.name=distributed_fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=1 \ - model.transformer_engine=true \ - model.optim.sched.constant_steps=1 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.normalization=rmsnorm \ - model.bias=False \ - model.bias_activation_fusion=False \ - model.bias_dropout_add_fusion=False \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.cpu_offloading_num_layers=7 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" - sh "python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=6 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.mcore_gpt=True \ - model.torch_distributed_checkpoint=True \ - model.tensor_model_parallel_size=2 \ - model.optim.name=distributed_fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.normalization=rmsnorm \ - model.bias=False \ - model.transformer_engine=True \ - model.bias_activation_fusion=False \ - model.bias_dropout_add_fusion=False \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.cpu_offloading_num_layers=7 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" - sh "rm -rf examples/nlp/language_modeling/gpt_pretrain_results" - sh "rm -rf examples/nlp/language_modeling/gpt_index_mappings" - } - } -/* - stage('L2: Megatron GPT Pretraining with EP=2') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - 
trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - model.expert_model_parallel_size=2 \ - ++model.num_moe_experts=2 \ - ++model.moe_router_topk=1 \ - ++model.megatron_amp_O2=True \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=1 \ - model.optim.sched.constant_steps=1 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.normalization=rmsnorm \ - model.bias=False \ - model.bias_activation_fusion=False \ - model.bias_dropout_add_fusion=False \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.cpu_offloading_num_layers=7 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method='block' \ - model.activations_checkpoint_granularity='full' \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" - sh "rm -rf examples/nlp/language_modeling/gpt_pretrain_results" - sh "rm -rf examples/nlp/language_modeling/gpt_index_mappings" - } - } -*/ - stage('L2: Megatron GPT with Rope Pretraining and Resume Training TP=2') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=1 \ - model.optim.sched.constant_steps=1 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.position_embedding_type=rope \ - model.rotary_percentage=0.5 \ - model.normalization=rmsnorm \ - model.transformer_engine=True \ - model.bias=False \ - model.bias_activation_fusion=False \ - model.bias_dropout_add_fusion=False \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.cpu_offloading_num_layers=7 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method='block' \ - model.activations_checkpoint_granularity='full' \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix='{train:[1.0,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document],validation:[/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document],test:[/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document]}' \ - 
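The RoPE stage is the one place in this block that uses the per-split dict form of model.data.data_prefix instead of the flat weighted list used elsewhere. For reference, both shapes expressed as plain OmegaConf containers; this is illustrative only, and NeMo's own parsing of the Hydra override may differ in detail:

```python
# Illustrative only: the two data_prefix shapes used by these GPT stages,
# built as plain OmegaConf containers.
from omegaconf import OmegaConf

doc = "/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document"

# Flat weighted list: [weight, path, weight, path, ...], as in the other GPT stages.
flat = OmegaConf.create([0.5, doc, 0.5, doc])

# Per-split mapping: explicit train/validation/test lists, as in the RoPE stage here.
per_split = OmegaConf.create({"train": [1.0, doc], "validation": [doc], "test": [doc]})

print(OmegaConf.to_yaml(per_split))
```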
model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" - // commented out to save time on github ci @adithyare - //sh "python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - //trainer.devices=2 \ - //trainer.accelerator=gpu \ - //trainer.log_every_n_steps=1 \ - //trainer.val_check_interval=2 \ - //trainer.limit_val_batches=1 \ - //trainer.accumulate_grad_batches=1 \ - //trainer.max_steps=6 \ - //trainer.precision=16 \ - //trainer.gradient_clip_val=1.0 \ - //exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - //exp_manager.resume_if_exists=True \ - //model.tensor_model_parallel_size=2 \ - //model.optim.name=fused_adam \ - //model.optim.lr=2e-4 \ - //model.optim.sched.warmup_steps=2 \ - //model.optim.sched.constant_steps=2 \ - //model.optim.sched.min_lr=8e-5 \ - //model.max_position_embeddings=128 \ - //model.encoder_seq_length=128 \ - //model.data.seq_length=128 \ - //model.position_embedding_type=rope \ - //model.rotary_percentage=0.5 \ - //model.normalization=rmsnorm \ - //model.bias=False \ - //model.bias_activation_fusion=False \ - //model.bias_dropout_add_fusion=False \ - //model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - //model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - //model.num_layers=8 \ - //model.hidden_size=256 \ - //model.num_attention_heads=8 \ - //model.activations_checkpoint_method='block' \ - //model.activations_checkpoint_granularity='full' \ - //model.activations_checkpoint_num_layers=1 \ - //model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - //model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" - sh "rm -rf examples/nlp/language_modeling/gpt_pretrain_results" - sh "rm -rf examples/nlp/language_modeling/gpt_index_mappings" - } - } - - // This test requires Ampere but some of the test GPUs are Volta - // Need to add a check for compute capability before uncommenting this test - // stage('L2: Megatron GPT with Rope Pretraining using Flash Attention and Resume Training TP=2') { - // when { - // anyOf { - // branch 'main' - // changeRequest target: 'main' - // } - // } - // failFast true - // steps { - // sh "python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - // trainer.devices=2 \ - // trainer.accelerator=gpu \ - // trainer.log_every_n_steps=1 \ - // trainer.val_check_interval=2 \ - // trainer.limit_val_batches=2 \ - // trainer.accumulate_grad_batches=1 \ - // trainer.max_steps=3 \ - // trainer.precision=16 \ - // trainer.gradient_clip_val=1.0 \ - // exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - // model.tensor_model_parallel_size=2 \ - // model.optim.name=fused_adam \ - // model.optim.lr=2e-4 \ - // model.optim.sched.warmup_steps=1 \ - // model.optim.sched.constant_steps=1 \ - // model.optim.sched.min_lr=8e-5 \ - // model.max_position_embeddings=128 \ - // model.encoder_seq_length=128 \ - // model.data.seq_length=128 \ - // model.position_embedding_type=rope \ - // model.rotary_percentage=0.5 \ - // model.normalization=rmsnorm \ - // model.bias=False \ - // model.bias_activation_fusion=False \ - // model.bias_dropout_add_fusion=False \ - // model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - // model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - // model.num_layers=8 
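The stage above enables model.position_embedding_type=rope with model.rotary_percentage=0.5, i.e. rotary embeddings applied to half of each head's dimension. A self-contained sketch of that idea; the pairing convention in the Megatron/NeMo kernels differs in detail, so treat this as an illustration only:

```python
# Sketch of rotary position embeddings (RoPE) applied to half of the head dimension,
# echoing position_embedding_type=rope with rotary_percentage=0.5. Illustrative only.
import torch

def apply_rope(x: torch.Tensor, rotary_percentage: float = 0.5, base: float = 10000.0) -> torch.Tensor:
    # x: (seq_len, head_dim); only the leading rotary fraction of head_dim is rotated.
    seq_len, head_dim = x.shape
    rot_dim = int(head_dim * rotary_percentage)
    x_rot, x_pass = x[:, :rot_dim], x[:, rot_dim:]
    inv_freq = 1.0 / (base ** (torch.arange(0, rot_dim, 2).float() / rot_dim))
    angles = torch.arange(seq_len).float()[:, None] * inv_freq[None, :]
    cos, sin = angles.cos(), angles.sin()
    x1, x2 = x_rot[:, 0::2], x_rot[:, 1::2]
    rotated = torch.stack((x1 * cos - x2 * sin, x1 * sin + x2 * cos), dim=-1).flatten(-2)
    return torch.cat([rotated, x_pass], dim=-1)

q = torch.randn(128, 64)    # seq_length=128 as in the stage above; head_dim chosen for illustration
print(apply_rope(q).shape)  # torch.Size([128, 64])
```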
\ - // model.hidden_size=256 \ - // model.num_attention_heads=8 \ - // model.activations_checkpoint_method='block' \ - // model.activations_checkpoint_granularity='full' \ - // model.activations_checkpoint_num_layers=1 \ - // model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - // model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings \ - // model.use_flash_attention=True " - // // commented out to save time on github ci @adithyare - // //sh "python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - // //trainer.devices=2 \ - // //trainer.accelerator=gpu \ - // //trainer.log_every_n_steps=1 \ - // //trainer.val_check_interval=2 \ - // //trainer.limit_val_batches=1 \ - // //trainer.accumulate_grad_batches=1 \ - // //trainer.max_steps=6 \ - // //trainer.precision=16 \ - // //trainer.gradient_clip_val=1.0 \ - // //exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - // //exp_manager.resume_if_exists=True \ - // //model.tensor_model_parallel_size=2 \ - // //model.optim.name=fused_adam \ - // //model.optim.lr=2e-4 \ - // //model.optim.sched.warmup_steps=2 \ - // //model.optim.sched.constant_steps=2 \ - // //model.optim.sched.min_lr=8e-5 \ - // //model.max_position_embeddings=128 \ - // //model.encoder_seq_length=128 \ - // //model.data.seq_length=128 \ - // //model.position_embedding_type=rope \ - // //model.rotary_percentage=0.5 \ - // //model.normalization=rmsnorm \ - // //model.bias=False \ - // //model.bias_activation_fusion=False \ - // //model.bias_dropout_add_fusion=False \ - // //model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - // //model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - // //model.num_layers=8 \ - // //model.hidden_size=256 \ - // //model.num_attention_heads=8 \ - // //model.activations_checkpoint_method='block' \ - // //model.activations_checkpoint_granularity='full' \ - // //model.activations_checkpoint_num_layers=1 \ - // //model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - // //model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings \ - // //model.use_flash_attention=True" - // sh "rm -rf examples/nlp/language_modeling/gpt_pretrain_results" - // sh "rm -rf examples/nlp/language_modeling/gpt_index_mappings" - // } - // } - // @chcui: model.cpu_offloading_num_layers=7 # temp workaround before m-lm !1124 is merged - stage('L2: Megatron GPT with ALiBi Pretraining and Resume Training TP=2') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=1 \ - model.optim.sched.constant_steps=1 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - 
model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.position_embedding_type=alibi \ - model.normalization=rmsnorm \ - model.bias=False \ - model.bias_activation_fusion=False \ - model.bias_dropout_add_fusion=False \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.cpu_offloading_num_layers=7 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method='block' \ - model.activations_checkpoint_granularity='full' \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" - // not testing resume functionality to save time on ci @adithyare - //sh "python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - //trainer.devices=2 \ - //trainer.accelerator=gpu \ - //trainer.log_every_n_steps=1 \ - //trainer.val_check_interval=2 \ - //trainer.limit_val_batches=1 \ - //trainer.accumulate_grad_batches=1 \ - //trainer.max_steps=6 \ - //trainer.precision=16 \ - //trainer.gradient_clip_val=1.0 \ - //exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - //exp_manager.resume_if_exists=True \ - //model.tensor_model_parallel_size=2 \ - //model.optim.name=fused_adam \ - //model.optim.lr=2e-4 \ - //model.optim.sched.warmup_steps=2 \ - //model.optim.sched.constant_steps=2 \ - //model.optim.sched.min_lr=8e-5 \ - //model.max_position_embeddings=128 \ - //model.encoder_seq_length=128 \ - //model.data.seq_length=128 \ - //model.position_embedding_type=alibi \ - //model.normalization=rmsnorm \ - //model.bias=False \ - //model.bias_activation_fusion=False \ - //model.bias_dropout_add_fusion=False \ - //model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - //model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - //model.num_layers=8 \ - //model.hidden_size=256 \ - //model.num_attention_heads=8 \ - //model.activations_checkpoint_method='block' \ - //model.activations_checkpoint_granularity='full' \ - //model.activations_checkpoint_num_layers=1 \ - //model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - //model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" - sh "rm -rf examples/nlp/language_modeling/gpt_pretrain_results" - sh "rm -rf examples/nlp/language_modeling/gpt_index_mappings" - } - } - // @chcui: model.cpu_offloading_num_layers=7 # temp workaround before m-lm !1124 is merged - stage('L2: Megatron GPT with KERPLE Pretraining and Resume Training TP=2') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - model.tensor_model_parallel_size=2 \ - 
model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=1 \ - model.optim.sched.constant_steps=1 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.position_embedding_type=kerple \ - model.normalization=rmsnorm \ - model.bias=False \ - model.bias_activation_fusion=False \ - model.bias_dropout_add_fusion=False \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.cpu_offloading_num_layers=7 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method='block' \ - model.activations_checkpoint_granularity='full' \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" - // commented out to save time on github ci @adithyare - //sh "python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - //trainer.devices=2 \ - //trainer.accelerator=gpu \ - //trainer.log_every_n_steps=1 \ - //trainer.val_check_interval=2 \ - //trainer.limit_val_batches=1 \ - //trainer.accumulate_grad_batches=1 \ - //trainer.max_steps=6 \ - //trainer.precision=16 \ - //trainer.gradient_clip_val=1.0 \ - //exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - //exp_manager.resume_if_exists=True \ - //model.tensor_model_parallel_size=2 \ - //model.optim.name=fused_adam \ - //model.optim.lr=2e-4 \ - //model.optim.sched.warmup_steps=2 \ - //model.optim.sched.constant_steps=2 \ - //model.optim.sched.min_lr=8e-5 \ - //model.max_position_embeddings=128 \ - //model.encoder_seq_length=128 \ - //model.data.seq_length=128 \ - //model.position_embedding_type=kerple \ - //model.normalization=rmsnorm \ - //model.bias=False \ - //model.bias_activation_fusion=False \ - //model.bias_dropout_add_fusion=False \ - //model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - //model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - //model.num_layers=8 \ - //model.hidden_size=256 \ - //model.num_attention_heads=8 \ - //model.activations_checkpoint_method='block' \ - //model.activations_checkpoint_granularity='full' \ - //model.activations_checkpoint_num_layers=1 \ - //model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - //model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" - sh "rm -rf examples/nlp/language_modeling/gpt_pretrain_results" - sh "rm -rf examples/nlp/language_modeling/gpt_index_mappings" - } - } - // @chcui: model.cpu_offloading_num_layers=7 # temp workaround before m-lm !1124 is merged - stage('L2: Megatron GPT Pretraining and Resume Training PP=2') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.precision=bf16 \ 
- trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - model.pipeline_model_parallel_size=2 \ - model.tensor_model_parallel_size=1 \ - model.mcore_gpt=True \ - model.megatron_amp_O2=True \ - model.optim.name=distributed_fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=1 \ - model.optim.sched.constant_steps=1 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.activation=fast-swiglu \ - model.bias_activation_fusion=False \ - model.hidden_dropout=0.0 \ - model.attention_dropout=0.0 \ - model.transformer_block_type=normformer \ - model.headscale=True \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.cpu_offloading_num_layers=7 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method='block' \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" - sh "python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=6 \ - trainer.precision=bf16 \ - trainer.gradient_clip_val=1.0 \ - model.mcore_gpt=True \ - model.megatron_amp_O2=True \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.pipeline_model_parallel_size=2 \ - model.tensor_model_parallel_size=1 \ - model.optim.name=distributed_fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.activation=fast-swiglu \ - model.bias_activation_fusion=False \ - model.hidden_dropout=0.0 \ - model.attention_dropout=0.0 \ - model.transformer_block_type=normformer \ - model.headscale=True \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.cpu_offloading_num_layers=7 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method='block' \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" - sh "rm -rf examples/nlp/language_modeling/gpt_pretrain_results" - sh "rm -rf examples/nlp/language_modeling/gpt_index_mappings" - } - } - stage('L2: Megatron GPT Finetuning PP=2') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \ - trainer.devices=2 \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - 
+trainer.limit_val_batches=2 \ - trainer.max_steps=3 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_sft_results \ - model.pipeline_model_parallel_size=2 \ - model.tensor_model_parallel_size=1 \ - model.restore_from_path=/home/TestData/nlp/megatron_gpt/PP2/gpt_pp2_tp1.nemo \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.peft.peft_scheme=null \ - model.data.train_ds.micro_batch_size=1 \ - model.data.train_ds.global_batch_size=4 \ - model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl,/home/TestData/nlp/megatron_sft/trec.jsonl] \ - model.data.train_ds.concat_sampling_probabilities=[0.3,0.7] \ - model.data.train_ds.num_workers=0 \ - model.data.test_ds.micro_batch_size=1 \ - model.data.test_ds.global_batch_size=1 \ - model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl,/home/TestData/nlp/megatron_sft/trec.jsonl] \ - model.data.test_ds.names=[quarel] \ - model.data.validation_ds.micro_batch_size=1 \ - model.data.validation_ds.global_batch_size=1 \ - model.data.validation_ds.num_workers=0 \ - model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl,/home/TestData/nlp/megatron_sft/trec.jsonl] \ - model.data.validation_ds.names=[quarel,trec]" - sh "python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \ - trainer.devices=2 \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - +trainer.limit_val_batches=2 \ - trainer.max_steps=3 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_sft_results \ - model.pipeline_model_parallel_size=2 \ - model.tensor_model_parallel_size=1 \ - model.restore_from_path=/home/TestData/nlp/megatron_gpt/PP2/gpt_pp2_tp1.nemo \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.peft.peft_scheme=null \ - model.data.train_ds.micro_batch_size=1 \ - model.data.train_ds.global_batch_size=4 \ - model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl,/home/TestData/nlp/megatron_sft/trec.jsonl] \ - model.data.train_ds.concat_sampling_probabilities=[0.3,0.7] \ - model.data.train_ds.num_workers=0 \ - model.data.test_ds.micro_batch_size=1 \ - model.data.test_ds.global_batch_size=1 \ - model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl,/home/TestData/nlp/megatron_sft/trec.jsonl] \ - model.data.test_ds.names=[quarel] \ - model.data.validation_ds.micro_batch_size=1 \ - model.data.validation_ds.global_batch_size=1 \ - model.data.validation_ds.num_workers=0 \ - model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl,/home/TestData/nlp/megatron_sft/trec.jsonl] \ - model.data.validation_ds.names=[quarel,trec]" - sh "rm -rf examples/nlp/language_modeling/gpt_sft_results" - } - } - stage('L2: Megatron GPT Finetuning StarCoder PP=1') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "python examples/nlp/language_modeling/tuning/megatron_gpt_sft.py \ - trainer.devices=1 \ - trainer.num_nodes=1 \ - trainer.precision=32 \ - trainer.max_steps=4 \ - trainer.val_check_interval=4 \ - trainer.enable_checkpointing=False \ - +trainer.limit_val_batches=2 \ - +trainer.limit_test_batches=2 \ - exp_manager.checkpoint_callback_params.save_best_model=False \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_sft_results \ - model.optim.name=distributed_fused_adam \ - 
model.restore_from_path=/home/TestData/nlp/megatron_gpt/starcoder-ci-nemo/megatron_starcoder_tp1_pp1.nemo \ - model.tensor_model_parallel_size=1 \ - model.pipeline_model_parallel_size=1 \ - model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.train_ds.num_workers=0 \ - model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.validation_ds.num_workers=0 \ - model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.test_ds.num_workers=0 \ - model.data.train_ds.concat_sampling_probabilities=[1.0]" - sh "rm -rf examples/nlp/language_modeling/gpt_sft_results" - } - } - stage('L2: Megatron GPT PEFT Lora PP=2') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "rm -rf examples/nlp/language_modeling/gpt_peft_lora_results_pp2" - sh "python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \ - trainer.devices=2 \ - trainer.log_every_n_steps=1 \ - trainer.max_epochs=9999 \ - trainer.max_steps=3 \ - trainer.val_check_interval=3 \ - ++trainer.limit_val_batches=2 \ - trainer.precision=16 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_peft_lora_results_pp2 \ - model.pipeline_model_parallel_size=2 \ - model.tensor_model_parallel_size=1 \ - model.restore_from_path=/home/TestData/nlp/megatron_gpt/PP2/gpt_pp2_tp1.nemo \ - model.peft.peft_scheme='lora' \ - model.answer_only_loss=True \ - model.micro_batch_size=1 \ - model.global_batch_size=1 \ - model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.train_ds.concat_sampling_probabilities=[1.0] \ - model.data.train_ds.num_workers=0 \ - model.data.validation_ds.num_workers=0 \ - model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.validation_ds.names=[quarel]" - sh "rm -rf examples/nlp/language_modeling/gpt_peft_lora_results_pp2" - } - } - stage('L2: Megatron GPT PEFT Lora TP=2') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "rm -rf /home/TestData/nlp/lora_tuning_tp2" - sh "python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \ - trainer.devices=2 \ - trainer.log_every_n_steps=1 \ - trainer.max_epochs=9999 \ - trainer.max_steps=3 \ - trainer.val_check_interval=3 \ - ++trainer.limit_val_batches=2 \ - trainer.precision=16 \ - exp_manager.exp_dir=/home/TestData/nlp/lora_tuning_tp2 \ - model.pipeline_model_parallel_size=1 \ - model.tensor_model_parallel_size=2 \ - model.restore_from_path=/home/TestData/nlp/megatron_gpt/TP2/megatron_gpt_tp2.nemo \ - model.peft.peft_scheme='lora' \ - model.answer_only_loss=True \ - model.micro_batch_size=1 \ - model.global_batch_size=1 \ - model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.train_ds.concat_sampling_probabilities=[1.0] \ - model.data.train_ds.num_workers=0 \ - model.data.validation_ds.num_workers=0 \ - model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.validation_ds.names=[quarel]" - sh "python examples/nlp/language_modeling/tuning/megatron_gpt_generate.py \ - model.restore_from_path=/home/TestData/nlp/megatron_gpt/TP2/megatron_gpt_tp2.nemo \ - model.peft.restore_from_path=/home/TestData/nlp/lora_tuning_tp2/megatron_gpt_peft_lora_tuning/checkpoints/megatron_gpt_peft_lora_tuning.nemo \ - model.tensor_model_parallel_size=2 \ - trainer.devices=2 \ - 
model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel_4.jsonl] \ - model.data.test_ds.names=['quarel4'] \ - model.global_batch_size=2 \ - model.micro_batch_size=1 \ - model.data.test_ds.tokens_to_generate=10 \ - model.data.test_ds.write_predictions_to_file=True \ - model.data.test_ds.output_file_path_prefix='/home/TestData/nlp/lora_tuning_tp2/out' \ - inference.greedy=True \ - inference.repetition_penalty=1.0 \ - inference.outfile_path='/home/TestData/nlp/lora_tuning_tp2/out.jsonl'" - sh "rm -rf /home/TestData/nlp/lora_tuning_tp2" - } - } - stage('L2: Megatron GPT PEFT Lora TP=2 SP') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "rm -rf /home/TestData/nlp/lora_tuning_tp2_sp" - sh "python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \ - trainer.devices=2 \ - trainer.log_every_n_steps=1 \ - trainer.max_epochs=9999 \ - trainer.max_steps=3 \ - trainer.val_check_interval=3 \ - ++trainer.limit_val_batches=2 \ - trainer.precision=16 \ - exp_manager.exp_dir=/home/TestData/nlp/lora_tuning_tp2 \ - model.pipeline_model_parallel_size=1 \ - model.tensor_model_parallel_size=2 \ - model.sequence_parallel=true \ - model.restore_from_path=/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo \ - model.peft.peft_scheme='lora' \ - model.answer_only_loss=True \ - model.micro_batch_size=1 \ - model.global_batch_size=1 \ - model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.train_ds.concat_sampling_probabilities=[1.0] \ - model.data.train_ds.num_workers=0 \ - model.data.validation_ds.num_workers=0 \ - model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.validation_ds.names=[quarel]" - sh "rm -rf /home/TestData/nlp/lora_tuning_tp2_sp" - } - } - stage('L2: Megatron GPT Eval') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps{ - sh "python examples/nlp/language_modeling/megatron_gpt_eval.py \ - gpt_model_file=/home/TestData/nlp/megatron_gpt/125M/megatron_gpt.nemo \ - prompts=['How to fix GPU memory? 
A:'] \ - tensor_model_parallel_size=1 \ - inference.tokens_to_generate=32 \ - trainer.precision=32" - } - } - stage('L2: Megatron GPT Eval PP2') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "python examples/nlp/language_modeling/megatron_gpt_eval.py \ - gpt_model_file=/home/TestData/nlp/megatron_gpt/PP2/gpt_pp2_tp1.nemo \ - server=False \ - tensor_model_parallel_size=1 \ - pipeline_model_parallel_size=2 \ - trainer.devices=2 \ - trainer.num_nodes=1 \ - trainer.precision=32" - } - } - stage('L2: Megatron GPT SFT Eval (inference seq len > training seq len)') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps{ - sh "python examples/nlp/language_modeling/tuning/megatron_gpt_generate.py \ - model.restore_from_path=/home/TestData/nlp/megatron_gpt_sft/megatron_gpt_rope_sft.nemo \ - model.peft.restore_from_path=null \ - model.data.test_ds.file_names=['/home/TestData/nlp/megatron_gpt_sft/sample.jsonl'] \ - model.data.test_ds.names=['test'] \ - model.data.test_ds.global_batch_size=1 \ - model.data.test_ds.micro_batch_size=1 \ - model.data.test_ds.tokens_to_generate=30 \ - model.data.test_ds.max_seq_length=6000 \ - model.data.test_ds.write_predictions_to_file=True \ - model.data.test_ds.output_file_path_prefix='examples/nlp/language_modeling/out' \ - inference.greedy=True \ - inference.repetition_penalty=1.0 \ - inference.outfile_path='examples/nlp/language_modeling/out.jsonl' && \ - rm -rf examples/nlp/language_modeling/out.jsonl" - } - } - - // TODO: Add this test back. Test was failing on CI machines due to HW error - // stage('L2: Megatron GPT Convert from Megatron-LM checkpoing and Eval') { - // when { - // anyOf { - // branch 'main' - // changeRequest target: 'main' - // } - // } - // failFast true - // steps { - // sh "python -m torch.distributed.launch --nproc_per_node=2 \ - // examples/nlp/language_modeling/megatron_lm_ckpt_to_nemo.py \ - // --checkpoint_folder=/home/TestData/nlp/megatron_gpt/data/gpt/iter_0008700 \ - // --checkpoint_name=model_optim_rng.pt \ - // --hparams_file=/home/TestData/nlp/megatron_gpt/data/gpt/iter_0008700/hparams.yaml \ - // --nemo_file_path=examples/nlp/language_modeling/small_gpt.nemo \ - // --model_type=gpt \ - // --pipeline_model_parallel_size=1 \ - // --gpus_per_node=2 \ - // --tensor_model_parallel_size=2" - // sh "python examples/nlp/language_modeling/megatron_gpt_eval.py \ - // --gpt_model_file=examples/nlp/language_modeling/small_gpt.nemo \ - // --tokens_to_generate=32 \ - // --tensor_model_parallel_size=2 \ - // --prompt='This is a test.'" - // sh "rm examples/nlp/language_modeling/small_gpt.nemo" - // } - // } - stage('L2: Megatron Change Partitions') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel{ - stage('Reduce TP Num Partitions (2 to 1) and PP Num Partitions (1 to 2)'){ - steps{ - sh "python examples/nlp/language_modeling/megatron_change_num_partitions.py \ - --model_file \ - /home/TestData/nlp/megatron_gpt/TP2/megatron_gpt_tp2.nemo \ - --target_file \ - /home/TestData/nlp/megatron_gpt/TP2-Temp/test-reduce.nemo \ - --tensor_model_parallel_size \ - 2 \ - --target_tensor_model_parallel_size \ - 1 \ - --pipeline_model_parallel_size \ - 1 \ - --target_pipeline_model_parallel_size \ - 2" - sh "rm /home/TestData/nlp/megatron_gpt/TP2-Temp/test-reduce.nemo" - } - } - stage('Increase TP Num Partitions (2 to 4) and PP Num Partitions (1 to 2)'){ - steps{ - sh "python 
examples/nlp/language_modeling/megatron_change_num_partitions.py \ - --model_file \ - /home/TestData/nlp/megatron_gpt/TP2/megatron_gpt_tp2.nemo \ - --target_file \ - /home/TestData/nlp/megatron_gpt/TP2-Temp/test-increase.nemo \ - --tensor_model_parallel_size \ - 2 \ - --target_tensor_model_parallel_size \ - 4 \ - --pipeline_model_parallel_size \ - 1 \ - --target_pipeline_model_parallel_size \ - 1" - sh "rm /home/TestData/nlp/megatron_gpt/TP2-Temp/test-increase.nemo" - } - } - } - } - stage('L2: Megatron T5 Pretraining and Resume Training TP=2') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation='swiglu' \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.position_embedding_type=relative \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation='fast-swiglu' \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type='pre_ln' \ - model.decoder.transformer_block_type='pre_ln' \ - model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ - model.data.data_impl=text_mmap \ - +model.data.data_impl_kwargs.newline_int=10 \ - +model.data.data_impl_kwargs.header_lines=0 \ - +model.data.data_impl_kwargs.workers=null \ - +model.data.data_impl_kwargs.sort_dataset_paths=False \ - model.share_token_embeddings=False \ - model.share_decoder_tokens_head_embeddings=False" - sh "python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation='swiglu' \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.position_embedding_type=relative \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - 
model.decoder.activation='fast-swiglu' \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type='pre_ln' \ - model.decoder.transformer_block_type='pre_ln' \ - model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ - model.data.data_impl=text_mmap \ - +model.data.data_impl_kwargs.newline_int=10 \ - +model.data.data_impl_kwargs.header_lines=0 \ - +model.data.data_impl_kwargs.workers=null \ - +model.data.data_impl_kwargs.sort_dataset_paths=False \ - model.share_token_embeddings=False \ - model.share_decoder_tokens_head_embeddings=False" - sh "rm -rf examples/nlp/language_modeling/t5_pretrain_results" - sh "rm -rf examples/nlp/language_modeling/t5_index_mappings" - } - } - stage('L2: Megatron T5 with ALiBi Pretraining and Resume Training TP=2') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation='swiglu' \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.position_embedding_type=alibi \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation='swiglu' \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type='pre_ln' \ - model.decoder.transformer_block_type='pre_ln' \ - model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ - model.data.data_impl=text_mmap \ - +model.data.data_impl_kwargs.newline_int=10 \ - +model.data.data_impl_kwargs.header_lines=0 \ - +model.data.data_impl_kwargs.workers=null \ - +model.data.data_impl_kwargs.sort_dataset_paths=False \ - model.share_token_embeddings=False \ - model.share_decoder_tokens_head_embeddings=False" - sh "python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - 
model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation='swiglu' \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.position_embedding_type=alibi \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation='swiglu' \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type='pre_ln' \ - model.decoder.transformer_block_type='pre_ln' \ - model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ - model.data.data_impl=text_mmap \ - +model.data.data_impl_kwargs.newline_int=10 \ - +model.data.data_impl_kwargs.header_lines=0 \ - +model.data.data_impl_kwargs.workers=null \ - +model.data.data_impl_kwargs.sort_dataset_paths=False \ - model.share_token_embeddings=False \ - model.share_decoder_tokens_head_embeddings=False" - sh "rm -rf examples/nlp/language_modeling/t5_pretrain_results" - sh "rm -rf examples/nlp/language_modeling/t5_index_mappings" - } - } - stage('L2: Megatron T5 with KERPLE Pretraining and Resume Training TP=2') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation='swiglu' \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.position_embedding_type=kerple \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation='swiglu' \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type='pre_ln' \ - model.decoder.transformer_block_type='pre_ln' \ - model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ - model.data.data_impl=text_mmap \ - +model.data.data_impl_kwargs.newline_int=10 \ - +model.data.data_impl_kwargs.header_lines=0 \ - +model.data.data_impl_kwargs.workers=null \ - +model.data.data_impl_kwargs.sort_dataset_paths=False \ - model.share_token_embeddings=False \ - 
model.share_decoder_tokens_head_embeddings=False" - sh "python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation='swiglu' \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.position_embedding_type=kerple \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation='swiglu' \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type='pre_ln' \ - model.decoder.transformer_block_type='pre_ln' \ - model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ - model.data.data_impl=text_mmap \ - +model.data.data_impl_kwargs.newline_int=10 \ - +model.data.data_impl_kwargs.header_lines=0 \ - +model.data.data_impl_kwargs.workers=null \ - +model.data.data_impl_kwargs.sort_dataset_paths=False \ - model.share_token_embeddings=False \ - model.share_decoder_tokens_head_embeddings=False" - sh "rm -rf examples/nlp/language_modeling/t5_pretrain_results" - sh "rm -rf examples/nlp/language_modeling/t5_index_mappings" - } - } - stage('L2: Megatron T5 Pretraining and Resume Training PP=2') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - model.pipeline_model_parallel_size=2 \ - model.pipeline_model_parallel_split_rank=1 \ - model.seq_length=256 \ - model.encoder.num_layers=4 \ - model.decoder.num_layers=1 \ - model.encoder.hidden_size=64 \ - model.decoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.decoder.num_attention_heads=8 \ - model.decoder.ffn_hidden_size=2048 \ - model.encoder.activation='gelu' \ - model.encoder.activations_checkpoint_method='block' \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type='pre_ln' \ - model.decoder.transformer_block_type='post_ln' \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings" - sh 
"python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.pipeline_model_parallel_size=2 \ - model.pipeline_model_parallel_split_rank=1 \ - model.seq_length=256 \ - model.encoder.num_layers=4 \ - model.decoder.num_layers=1 \ - model.encoder.hidden_size=64 \ - model.decoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.decoder.num_attention_heads=8 \ - model.decoder.ffn_hidden_size=2048 \ - model.encoder.activation='gelu' \ - model.encoder.activations_checkpoint_method='block' \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type='pre_ln' \ - model.decoder.transformer_block_type='post_ln' \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings" - sh "rm -rf examples/nlp/language_modeling/t5_pretrain_results" - sh "rm -rf examples/nlp/language_modeling/t5_index_mappings" - } - } - stage('L2: Megatron T5 w/ Mixture of Expert Pretraining') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - model.pipeline_model_parallel_split_rank=1 \ - model.seq_length=256 \ - model.encoder.num_layers=4 \ - model.decoder.num_layers=1 \ - model.encoder.num_moe_experts=4 \ - model.decoder.num_moe_experts=4 \ - model.encoder.moe_frequency=3 \ - model.decoder.moe_frequency=1 \ - model.encoder.hidden_size=64 \ - model.decoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.decoder.num_attention_heads=8 \ - model.decoder.ffn_hidden_size=2048 \ - model.encoder.activation='gelu' \ - model.encoder.activations_checkpoint_method='block' \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type='pre_ln' \ - model.decoder.transformer_block_type='post_ln' \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings" - sh "rm -rf examples/nlp/language_modeling/t5_pretrain_results" - sh "rm -rf examples/nlp/language_modeling/t5_index_mappings" - } - } - - stage('L2: Megatron UL2 Pretraining and Resume Training TP=2') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "python examples/nlp/language_modeling/megatron_t5_pretraining.py -cn megatron_ul2_config \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - 
trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation='swiglu' \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type='normformer' \ - model.encoder.headscale=True \ - model.decoder.num_layers=4 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation='geglu' \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.decoder.transformer_block_type='normformer' \ - model.decoder.headscale=False \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings" - sh "python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation='swiglu' \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type='normformer' \ - model.encoder.headscale=True \ - model.decoder.num_layers=4 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation='geglu' \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.decoder.transformer_block_type='normformer' \ - model.decoder.headscale=False \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings" - sh "rm -rf examples/nlp/language_modeling/t5_pretrain_results" - sh "rm -rf examples/nlp/language_modeling/t5_index_mappings" - } - } - stage('L2: Megatron T5 Eval') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps{ - sh "python examples/nlp/language_modeling/megatron_t5_eval.py \ - --model_file \ - /home/TestData/nlp/megatron_t5/8m/megatron_t5_8m-refactor.nemo \ - --prompt \ - 'How do I fix my GPU memory issue? I am seeing out of memory.' 
\ - --tensor_model_parallel_size 1" - } - } - stage('L2: Megatron BART Pretraining and Resume Training, TP=2') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "python examples/nlp/language_modeling/megatron_bart_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bart_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation='reglu' \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=4 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation='reglu' \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.data.data_prefix='{train:[1.0,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document],test:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document], validation:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document]}'" - sh "python examples/nlp/language_modeling/megatron_bart_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=5 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=6 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bart_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation='reglu' \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=4 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation='reglu' \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.data.data_prefix='{train:[1.0,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document],test:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document], validation:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document]}'" - sh "rm -rf examples/nlp/language_modeling/bart_pretrain_results" - } - } - stage('L2: Megatron BART Pretraining and Resume Training, PP=2') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "python examples/nlp/language_modeling/megatron_bart_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - 
trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bart_pretrain_results \ - model.pipeline_model_parallel_size=2 \ - model.pipeline_model_parallel_split_rank=1 \ - model.seq_length=256 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation='geglu' \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=4 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation='geglu' \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.data.respect_document_boundaries=False \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document]" - sh "python examples/nlp/language_modeling/megatron_bart_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bart_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.pipeline_model_parallel_size=2 \ - model.pipeline_model_parallel_split_rank=1 \ - model.seq_length=256 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation='geglu' \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=4 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation='geglu' \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.data.respect_document_boundaries=False \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document]" - sh "rm -rf examples/nlp/language_modeling/bart_pretrain_results" - } - } - stage('L2: Megatron T5 GLUE/XNLI Finetuning') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - // TODO(Oktai15): update it in 1.8.0 version - stage('T5 GLUE RTE') { - steps { - sh "python examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py \ - trainer.devices=1 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - +trainer.limit_val_batches=2 \ - +trainer.limit_test_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=2 \ - trainer.precision=16 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_glue_results \ - model.restore_from_path=/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m-refactor.nemo \ - model.pipeline_model_parallel_size=1 \ - model.pipeline_model_parallel_split_rank=0 \ - model.data.train_ds.task_name=rte \ - model.data.train_ds.global_batch_size=4 \ - model.data.train_ds.micro_batch_size=2 \ - 
model.data.validation_ds.global_batch_size=2 \ - model.data.validation_ds.micro_batch_size=2 \ - model.data.train_ds.file_path=/home/TestData/nlp/megatron_t5/data/train_ci.tsv \ - model.data.validation_ds.task_name=rte \ - model.data.validation_ds.file_path=/home/TestData/nlp/megatron_t5/data/dev_ci.tsv \ - " - sh "rm -rf examples/nlp/language_modeling/t5_glue_results" - } - } - stage('T5 GLUE XNLI') { - steps { - sh "python examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py \ - -cn megatron_t5_config_finetune_glue_xnli \ - trainer.devices=1 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - +trainer.limit_val_batches=2 \ - +trainer.limit_test_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=2 \ - trainer.precision=16 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_xnli_results \ - model.restore_from_path=/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m-refactor.nemo \ - model.pipeline_model_parallel_size=1 \ - model.pipeline_model_parallel_split_rank=0 \ - model.data.train_ds.global_batch_size=4 \ - model.data.train_ds.micro_batch_size=2 \ - model.data.validation_ds.global_batch_size=2 \ - model.data.validation_ds.micro_batch_size=2 \ - model.data.test_ds.global_batch_size=2 \ - model.data.test_ds.micro_batch_size=2 \ - model.data.train_ds.task_name=rte \ - model.data.train_ds.file_path=/home/TestData/nlp/megatron_t5/data/train_ci.tsv \ - model.data.validation_ds.task_name=xnli \ - model.data.validation_ds.file_path=/home/TestData/nlp/megatron_t5/data/xnli_dev_ci.tsv \ - model.data.test_ds.task_name=xnli \ - model.data.test_ds.file_path=/home/TestData/nlp/megatron_t5/data/xnli_dev_ci.tsv \ - " - sh "rm -rf examples/nlp/language_modeling/t5_xnli_results" - } - } - } - } - - stage('L2: Megatron T5 PEFT Lora TP=2') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "rm -rf /home/TestData/nlp/t5_lora_tuning_tp2" - sh "python examples/nlp/language_modeling/tuning/megatron_t5_finetuning.py \ - trainer.devices=2 \ - trainer.log_every_n_steps=1 \ - trainer.max_epochs=9999 \ - trainer.max_steps=3 \ - trainer.val_check_interval=3 \ - ++trainer.limit_val_batches=2 \ - trainer.precision=16 \ - exp_manager.exp_dir=/home/TestData/nlp/t5_lora_tuning_tp2 \ - model.pipeline_model_parallel_size=1 \ - model.tensor_model_parallel_size=2 \ - model.restore_from_path=/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m_tp2.nemo \ - model.peft.peft_scheme='lora' \ - model.answer_only_loss=True \ - model.micro_batch_size=1 \ - model.global_batch_size=1 \ - model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.train_ds.concat_sampling_probabilities=[1.0] \ - model.data.train_ds.num_workers=0 \ - model.data.validation_ds.num_workers=0 \ - model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.validation_ds.names=[quarel]" - sh "python examples/nlp/language_modeling/tuning/megatron_t5_generate.py \ - model.restore_from_path=/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m_tp2.nemo \ - model.peft.restore_from_path=/home/TestData/nlp/t5_lora_tuning_tp2/megatron_t5_peft_lora_tuning/checkpoints/megatron_t5_peft_lora_tuning.nemo \ - model.peft.restore_from_ckpt_name=null \ - model.peft.restore_from_hparams_path=null \ - model.tensor_model_parallel_size=2 \ - trainer.devices=2 \ - model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel_4.jsonl] \ - model.data.test_ds.names=['quarel4'] \ - 
model.global_batch_size=2 \ - model.micro_batch_size=1 \ - model.data.test_ds.tokens_to_generate=10 \ - model.data.test_ds.write_predictions_to_file=True \ - model.data.test_ds.output_file_path_prefix='/home/TestData/nlp/t5_lora_tuning_tp2/out' \ - inference.greedy=True \ - inference.repetition_penalty=1.0 \ - inference.outfile_path='/home/TestData/nlp/t5_lora_tuning_tp2/out.jsonl'" - sh "rm -rf /home/TestData/nlp/t5_lora_tuning_tp2" - } - } - - stage('L2: Megatron FIM Dataset') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=1 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - ++model.name=megatron_gpt_full_te_layer_autocast \ - model.mcore_gpt=True \ - model.tensor_model_parallel_size=1 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=1 \ - model.optim.sched.constant_steps=1 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.normalization=layernorm1p \ - model.bias_activation_fusion=True \ - model.bias_dropout_add_fusion=True \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=null \ - model.activations_checkpoint_granularity=null \ - model.activations_checkpoint_num_layers=null \ - model.data.data_prefix='[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document]' \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings \ - ++model.data.add_fim=True \ - ++model.data.fim.extra_tokens.prefix='fim_prefix' \ - ++model.data.fim.extra_tokens.middle='fim_middle' \ - ++model.data.fim.extra_tokens.suffix='fim_suffix' \ - ++model.data.fim.extra_tokens.pad='fim_pad' \ - ++model.data.fim.extra_tokens.eod='endoftext'" - sh "rm -rf examples/nlp/language_modeling/gpt_pretrain_results" - } - } - - stage('L2: Megatron Mock Data Generation') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage('MockGPTDataset') { - steps { - sh "python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.max_steps=10 \ - trainer.limit_val_batches=7 \ - trainer.val_check_interval=10 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - model.data.data_impl=mock \ - model.data.data_prefix=[] \ - " - sh "rm -rf examples/nlp/language_modeling/gpt_pretrain_results" - } - } - stage('MockT5Dataset') { - steps { - sh "python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.max_steps=10 \ - trainer.limit_val_batches=3 \ - trainer.val_check_interval=10 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - model.data.data_impl=mock \ - model.data.data_prefix=[] \ - " - sh "rm -rf examples/nlp/language_modeling/t5_pretrain_results" - } - } - } - } - - stage('L2: TTS Fast dev 
runs 1') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - parallel { - stage('Tacotron 2') { - steps { - sh 'python examples/tts/tacotron2.py \ - train_dataset=/home/TestData/an4_dataset/an4_train.json \ - validation_datasets=/home/TestData/an4_dataset/an4_val.json \ - trainer.devices=[0] \ - trainer.accelerator="gpu" \ - +trainer.limit_train_batches=1 +trainer.limit_val_batches=1 trainer.max_epochs=1 \ - trainer.strategy=auto \ - model.decoder.decoder_rnn_dim=256 \ - model.decoder.attention_rnn_dim=1024 \ - model.decoder.prenet_dim=128 \ - model.postnet.postnet_n_convolutions=3 \ - model.train_ds.dataloader_params.batch_size=4 \ - model.train_ds.dataloader_params.num_workers=0 \ - model.validation_ds.dataloader_params.batch_size=4 \ - model.validation_ds.dataloader_params.num_workers=0 \ - ~model.text_normalizer \ - ~model.text_normalizer_call_kwargs \ - ~trainer.check_val_every_n_epoch \ - ' - } - } - stage('WaveGlow') { - steps { - sh 'python examples/tts/waveglow.py \ - train_dataset=/home/TestData/an4_dataset/an4_train.json \ - validation_datasets=/home/TestData/an4_dataset/an4_val.json \ - trainer.devices="[0]" \ - +trainer.limit_train_batches=1 +trainer.limit_val_batches=1 trainer.max_epochs=1 \ - trainer.strategy=auto \ - model.train_ds.dataloader_params.batch_size=4 \ - model.train_ds.dataloader_params.num_workers=0 \ - model.validation_ds.dataloader_params.batch_size=4 \ - model.validation_ds.dataloader_params.num_workers=0 \ - model.waveglow.n_flows=4 \ - model.waveglow.n_wn_layers=2 \ - model.waveglow.n_wn_channels=32 \ - ~trainer.check_val_every_n_epoch' - } - } - stage('FastPitch') { - steps { - sh 'python examples/tts/fastpitch.py \ - --config-name fastpitch_align_v1.05 \ - train_dataset=/home/TestData/an4_dataset/an4_train.json \ - validation_datasets=/home/TestData/an4_dataset/an4_val.json \ - sup_data_path=/home/TestData/an4_dataset/beta_priors \ - trainer.devices="[0]" \ - +trainer.limit_train_batches=1 \ - +trainer.limit_val_batches=1 \ - trainer.max_epochs=1 \ - trainer.strategy=auto \ - model.pitch_mean=212.35873413085938 \ - model.pitch_std=68.52806091308594 \ - model.train_ds.dataloader_params.batch_size=4 \ - model.train_ds.dataloader_params.num_workers=0 \ - model.validation_ds.dataloader_params.batch_size=4 \ - model.validation_ds.dataloader_params.num_workers=0 \ - model.symbols_embedding_dim=64 \ - model.input_fft.d_inner=384 \ - model.input_fft.n_layer=2 \ - model.output_fft.d_inner=384 \ - model.output_fft.n_layer=2 \ - ~trainer.check_val_every_n_epoch \ - ~model.text_normalizer \ - ~model.text_normalizer_call_kwargs' - } - } - stage('RADTTS') { - steps { - sh 'python examples/tts/radtts.py \ - train_dataset=/home/TestData/an4_dataset/an4_train.json \ - validation_datasets=/home/TestData/an4_dataset/an4_val.json \ - sup_data_path=/home/TestData/an4_dataset/radtts_beta_priors \ - trainer.devices="[0]" \ - +trainer.limit_train_batches=1 \ - +trainer.limit_val_batches=1 \ - trainer.max_epochs=1 \ - trainer.strategy=auto \ - model.pitch_mean=212.35873413085938 \ - model.pitch_std=68.52806091308594 \ - model.train_ds.dataloader_params.batch_size=4 \ - model.train_ds.dataloader_params.num_workers=0 \ - model.validation_ds.dataloader_params.batch_size=4 \ - model.validation_ds.dataloader_params.num_workers=0 \ - export_dir=/home/TestData/radtts_test \ - model.optim.lr=0.0001 \ - model.modelConfig.decoder_use_partial_padding=True \ - ~trainer.check_val_every_n_epoch \ - ~model.text_normalizer \ - ~model.text_normalizer_call_kwargs' - } - 
} - stage('Mixer-TTS') { - steps { - sh 'python examples/tts/mixer_tts.py \ - train_dataset=/home/TestData/an4_dataset/an4_train.json \ - validation_datasets=/home/TestData/an4_dataset/an4_val.json \ - sup_data_path=/home/TestData/an4_dataset/sup_data \ - trainer.devices="[0]" \ - +trainer.limit_train_batches=1 \ - +trainer.limit_val_batches=1 \ - trainer.max_epochs=1 \ - trainer.strategy=auto \ - model.pitch_mean=212.35873413085938 \ - model.pitch_std=68.52806091308594 \ - model.train_ds.dataloader_params.batch_size=4 \ - model.train_ds.dataloader_params.num_workers=0 \ - model.validation_ds.dataloader_params.batch_size=4 \ - model.validation_ds.dataloader_params.num_workers=0 \ - ~trainer.check_val_every_n_epoch \ - ~model.text_normalizer \ - ~model.text_normalizer_call_kwargs' - } - } - stage('Hifigan') { - steps { - sh 'python examples/tts/hifigan.py \ - train_dataset=/home/TestData/an4_dataset/an4_train.json \ - validation_datasets=/home/TestData/an4_dataset/an4_val.json \ - trainer.devices="[0]" \ - +trainer.limit_train_batches=1 \ - +trainer.limit_val_batches=1 \ - +trainer.max_epochs=1 \ - trainer.strategy=auto \ - model.train_ds.dataloader_params.batch_size=4 \ - model.train_ds.dataloader_params.num_workers=0 \ - model.validation_ds.dataloader_params.batch_size=4 \ - model.validation_ds.dataloader_params.num_workers=0 \ - model.generator.upsample_initial_channel=64 \ - +model.debug=true \ - ~trainer.check_val_every_n_epoch' - } - } - } - } - stage('L2: NeRF') { - when { - anyOf { - branch 'r1.21.0' - changeRequest target: 'r1.21.0' - } - } - parallel { - stage('DreamFusion') { - steps { - sh 'python examples/multimodal/text_to_image/nerf/main.py \ - trainer.num_nodes=1 \ - trainer.devices="[0]" \ - trainer.max_steps=1000 \ - model.prompt="a DSLR photo of a delicious hamburger" \ - exp_manager.exp_dir=examples/multimodal/text_to_image/nerf/dreamfusion_results' - sh 'rm -rf examples/multimodal/text_to_image/nerf/dreamfusion_results' - } - } - } - } - stage('L??: Speech Checkpoints tests') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh 'CUDA_VISIBLE_DEVICES=0 python examples/asr/speech_to_text_eval.py \ - pretrained_name=QuartzNet15x5Base-En \ - dataset_manifest=/home/TestData/librispeech/librivox-dev-other.json \ - batch_size=64 \ - tolerance=0.1012' - sh 'rm -f examples/asr/evaluation_transcripts.json' - } - } - } - - post { - always { - sh 'chmod -R 777 .' - cleanWs() - } - } -} diff --git a/README.rst b/README.rst index 66b3a5806c2d..89ed934527d8 100644 --- a/README.rst +++ b/README.rst @@ -77,63 +77,85 @@ Latest News +
+ Speech Recognition +
+ New Standard for Speech Recognition and Translation from the NVIDIA NeMo Canary Model (2024/04/18) + + The NeMo team just released Canary, a multilingual model that transcribes speech in English, Spanish, German, and French with punctuation and capitalization. Canary also provides bi-directional translation between English and the three other supported languages. +

+
+ +
+ Pushing the Boundaries of Speech Recognition with NVIDIA NeMo Parakeet ASR Models (2024/04/18) + + NVIDIA NeMo, an end-to-end platform for the development of multimodal generative AI models at scale anywhere—on any cloud and on-premises—released the Parakeet family of automatic speech recognition (ASR) models. These state-of-the-art ASR models, developed in collaboration with Suno.ai, transcribe spoken English with exceptional accuracy. +

+
+ +
+ Turbocharge ASR Accuracy and Speed with NVIDIA NeMo Parakeet-TDT (2024/04/18) + + NVIDIA NeMo, an end-to-end platform for developing multimodal generative AI models at scale anywhere—on any cloud and on-premises—recently released Parakeet-TDT. This new addition to the NeMo ASR Parakeet model family boasts better accuracy and 64% greater speed over the previous best model, Parakeet-RNNT-1.1B. +

+
+ +
+ Introduction ------------ -NVIDIA NeMo Framework is a generative AI framework built for researchers and pytorch developers -working on large language models (LLMs), multimodal models (MM), automatic speech recognition (ASR), -and text-to-speech synthesis (TTS). -The primary objective of NeMo is to provide a scalable framework for researchers and developers from industry and academia -to more easily implement and design new generative AI models by being able to leverage existing code and pretrained models. +NVIDIA NeMo Framework is a scalable and cloud-native generative AI framework built for researchers and PyTorch developers working on Large Language Models (LLMs), Multimodal Models (MMs), Automatic Speech Recognition (ASR), Text to Speech (TTS), and Computer Vision (CV) domains. It is designed to help you efficiently create, customize, and deploy new generative AI models by leveraging existing code and pre-trained model checkpoints. For technical documentation, please see the `NeMo Framework User Guide `_. -All NeMo models are trained with `Lightning `_ and -training is automatically scalable to 1000s of GPUs. +LLMs and MMs Training, Alignment, and Customization +################################################### + +All NeMo models are trained with `Lightning `_. +Training is automatically scalable to 1000s of GPUs. + +When applicable, NeMo models leverage cutting-edge distributed training techniques, incorporating `parallelism strategies `_ to enable efficient training of very large models. These techniques include Tensor Parallelism (TP), Pipeline Parallelism (PP), Fully Sharded Data Parallelism (FSDP), Mixture-of-Experts (MoE), and Mixed Precision Training with BFloat16 and FP8, as well as others. + +NeMo Transformer-based LLMs and MMs utilize `NVIDIA Transformer Engine `_ for FP8 training on NVIDIA Hopper GPUs, while leveraging `NVIDIA Megatron Core `_ for scaling Transformer model training. + +NeMo LLMs can be aligned with state-of-the-art methods such as SteerLM, Direct Preference Optimization (DPO), and Reinforcement Learning from Human Feedback (RLHF). See `NVIDIA NeMo Aligner `_ for more information. -When applicable, NeMo models take advantage of the latest possible distributed training techniques, -including parallelism strategies such as +In addition to supervised fine-tuning (SFT), NeMo also supports the latest parameter efficient fine-tuning (PEFT) techniques such as LoRA, P-Tuning, Adapters, and IA3. Refer to the `NeMo Framework User Guide `_ for the full list of supported models and techniques. -* data parallelism -* tensor parallelism -* pipeline model parallelism -* fully sharded data parallelism (FSDP) -* sequence parallelism -* context parallelism -* mixture-of-experts (MoE) +LLMs and MMs Deployment and Optimization +######################################## -and mixed precision training recipes with bfloat16 and FP8 training. +NeMo LLMs and MMs can be deployed and optimized with `NVIDIA NeMo Microservices `_. -NeMo's Transformer based LLM and Multimodal models leverage `NVIDIA Transformer Engine `_ for FP8 training on NVIDIA Hopper GPUs -and leverages `NVIDIA Megatron Core `_ for scaling transformer model training. +Speech AI +######### -NeMo LLMs can be aligned with state of the art methods such as SteerLM, DPO and Reinforcement Learning from Human Feedback (RLHF), -see `NVIDIA NeMo Aligner `_ for more details. +NeMo ASR and TTS models can be optimized for inference and deployed for production use cases with `NVIDIA Riva `_. 
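As a quick illustration of this Speech AI workflow, the sketch below loads a pretrained ASR checkpoint and transcribes one file. This is a minimal, hedged example rather than part of this change; the audio path is a placeholder, and the ``QuartzNet15x5Base-En`` checkpoint name is taken from the CI tests elsewhere in this diff.

.. code-block:: python

    # Minimal sketch: load a pretrained NeMo ASR checkpoint and transcribe a file.
    # Assumes nemo_toolkit['asr'] is installed; the audio path is a placeholder.
    import nemo.collections.asr as nemo_asr

    # One of the freely available pretrained checkpoints on NGC / Hugging Face Hub.
    asr_model = nemo_asr.models.ASRModel.from_pretrained(model_name="QuartzNet15x5Base-En")

    # `transcribe` takes a list of audio files and returns the predicted transcripts.
    transcripts = asr_model.transcribe(["/path/to/sample.wav"])
    print(transcripts[0])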
-NeMo LLM and Multimodal models can be deployed and optimized with `NVIDIA Inference Microservices (Early Access) `_. +NeMo Framework Launcher +####################### -NeMo ASR and TTS models can be optimized for inference and deployed for production use-cases with `NVIDIA Riva `_. +`NeMo Framework Launcher `_ is a cloud-native tool that streamlines the NeMo Framework experience. It is used for launching end-to-end NeMo Framework training jobs on CSPs and Slurm clusters. -For scaling NeMo LLM and Multimodal training on Slurm clusters or public clouds, please see the `NVIDIA Framework Launcher `_. -The NeMo Framework launcher has extensive recipes, scripts, utilities, and documentation for training NeMo LLMs and Multimodal models and also has an `Autoconfigurator `_ -which can be used to find the optimal model parallel configuration for training on a specific cluster. -To get started quickly with the NeMo Framework Launcher, please see the `NeMo Framework Playbooks `_ -The NeMo Framework Launcher does not currently support ASR and TTS training but will soon. +The NeMo Framework Launcher includes extensive recipes, scripts, utilities, and documentation for training NeMo LLMs. It also includes the NeMo Framework `Autoconfigurator `_, which is designed to find the optimal model parallel configuration for training on a specific cluster. -Getting started with NeMo is simple. -State of the Art pretrained NeMo models are freely available on `HuggingFace Hub `_ and +To get started quickly with the NeMo Framework Launcher, please see the `NeMo Framework Playbooks `_. The NeMo Framework Launcher does not currently support ASR and TTS training, but it will soon. + +Get Started with NeMo Framework +------------------------------- + +Getting started with NeMo Framework is easy. State-of-the-art pretrained NeMo models are freely available on `Hugging Face Hub `_ and `NVIDIA NGC `_. These models can be used to generate text or images, transcribe audio, and synthesize speech in just a few lines of code. We have extensive `tutorials `_ that -can be run on `Google Colab `_ or with our `NGC NeMo Framework Container. `_ -and we have `playbooks `_ for users that want to train NeMo models with the NeMo Framework Launcher. +can be run on `Google Colab `_ or with our `NGC NeMo Framework Container `_. We also have `playbooks `_ for users who want to train NeMo models with the NeMo Framework Launcher. -For advanced users that want to train NeMo models from scratch or finetune existing NeMo models -we have a full suite of `example scripts `_ that support multi-GPU/multi-node training. +For advanced users who want to train NeMo models from scratch or fine-tune existing NeMo models, we have a full suite of `example scripts `_ that support multi-GPU/multi-node training. Key Features ------------ @@ -147,9 +169,9 @@ Key Features Requirements ------------ -1) Python 3.10 or above -2) Pytorch 1.13.1 or above -3) NVIDIA GPU, if you intend to do model training +* Python 3.10 or above +* Pytorch 1.13.1 or above +* NVIDIA GPU (if you intend to do model training) Developer Documentation ----------------------- @@ -172,65 +194,61 @@ Developer Documentation | Stable | |stable| | `Documentation of the stable (i.e. most recent release) branch. 
`_ | +---------+-------------+------------------------------------------------------------------------------------------------------------------------------------------+ - -Getting help with NeMo +Install NeMo Framework ---------------------- -FAQ can be found on NeMo's `Discussions board `_. You are welcome to ask questions or start discussions there. - - -Installation ------------- The NeMo Framework can be installed in a variety of ways, depending on your needs. Depending on the domain, you may find one of the following installation methods more suitable. -* Conda / Pip - Refer to the `Conda <#conda>`_ and `Pip <#pip>`_ sections for installation instructions. +* Conda / Pip - Refer to `Conda <#conda>`_ and `Pip <#pip>`_ for installation instructions. + + * This is the recommended method for ASR and TTS domains. + * When using a Nvidia PyTorch container as the base, this is the recommended method for all domains. - * This is recommended for Automatic Speech Recognition (ASR) and Text-to-Speech (TTS) domains. - * When using a Nvidia PyTorch container as the base, this is the recommended installation method for all domains. +* Docker Containers - Refer to `Docker containers <#docker-containers>`_ for installation instructions. -* Docker Containers - Refer to the `Docker containers <#docker-containers>`_ section for installation instructions. + * NeMo Framework container - `nvcr.io/nvidia/nemo:24.05` - * This is recommended for Large Language Models (LLM), Multimodal and Vision domains. - * NeMo LLM & Multimodal Container - `nvcr.io/nvidia/nemo:24.03.framework` - * NeMo Speech Container - `nvcr.io/nvidia/nemo:24.01.speech` +* LLMs and MMs Dependencies - Refer to `LLMs and MMs Dependencies <#install-llms-and-mms-dependencies>`_ for installation instructions. -* LLM and Multimodal Dependencies - Refer to the `LLM and Multimodal dependencies <#llm-and-multimodal-dependencies>`_ section for isntallation instructions. - * It's higly recommended to start with a base NVIDIA PyTorch container: `nvcr.io/nvidia/pytorch:24.02-py3` +**Important: We strongly recommended that you start with a base NVIDIA PyTorch container: nvcr.io/nvidia/pytorch:24.02-py3.** Conda -~~~~~ +^^^^^^ -We recommend installing NeMo in a fresh Conda environment. +Install NeMo in a fresh Conda environment: .. code-block:: bash conda create --name nemo python==3.10.12 conda activate nemo -Install PyTorch using their `configurator `_. +Install PyTorch using their `configurator `_: .. code-block:: bash conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia -The command used to install PyTorch may depend on your system. Please use the configurator linked above to find the right command for your system. +The command to install PyTorch may depend on your system. Use the configurator linked above to find the right command for your system. + +Then, install NeMo via Pip or from Source. We do not provide NeMo on the conda-forge or any other Conda channel. Pip -~~~ -Use this installation mode if you want the latest released version. +^^^ + +To install the nemo_toolkit, use the following installation method: .. code-block:: bash apt-get update && apt-get install -y libsndfile1 ffmpeg - pip install Cython + pip install Cython packaging pip install nemo_toolkit['all'] -Depending on the shell used, you may need to use ``"nemo_toolkit[all]"`` instead in the above command. +Depending on the shell used, you may need to use the ``"nemo_toolkit[all]"`` specifier instead in the above command. 
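A quick way to confirm that the ``nemo_toolkit`` installation succeeded is the short check below. It is a sketch, not an official verification script, and assumes the ``all`` (or at least ``asr``) extra was installed.

.. code-block:: python

    # Post-install sanity check (sketch): confirm the toolkit imports and
    # list a few pretrained ASR checkpoints that can be downloaded later.
    import nemo
    from nemo.collections.asr.models import ASRModel

    print(nemo.__version__)

    # Each entry describes a checkpoint usable with `from_pretrained`.
    for model_info in ASRModel.list_available_models()[:5]:
        print(model_info.pretrained_model_name)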
-Pip (Domain Specific) -~~~~~~~~~~~~~~~~~~~~~ +Pip from a Specific Domain +^^^^^^^^^^^^^^^^^^^^^^^^^^ -To install only a specific domain of NeMo, use the following commands. Note: It is required to install the above pre-requisites before installing a specific domain of NeMo. +To install a specific domain of NeMo, you must first install the nemo_toolkit using the instructions listed above. Then, you run the following domain-specific commands: .. code-block:: bash @@ -240,20 +258,22 @@ To install only a specific domain of NeMo, use the following commands. Note: It pip install nemo_toolkit['vision'] pip install nemo_toolkit['multimodal'] -Pip from source -~~~~~~~~~~~~~~~ -Use this installation mode if you want the version from a particular GitHub branch (e.g main). +Pip from a Source Branch +^^^^^^^^^^^^^^^^^^^^^^^^ + +If you want to work with a specific version of NeMo from a particular GitHub branch (e.g main), use the following installation method: .. code-block:: bash apt-get update && apt-get install -y libsndfile1 ffmpeg - pip install Cython + pip install Cython packaging python -m pip install git+https://github.com/NVIDIA/NeMo.git@{BRANCH}#egg=nemo_toolkit[all] -From source -~~~~~~~~~~~ -Use this installation mode if you are contributing to NeMo. +Build from Source +^^^^^^^^^^^^^^^^^ + +If you want to clone the NeMo GitHub repository and contribute to NeMo open-source development work, use the following installation method: .. code-block:: bash @@ -262,18 +282,16 @@ Use this installation mode if you are contributing to NeMo. cd NeMo ./reinstall.sh -If you only want the toolkit without additional conda-based dependencies, you may replace ``reinstall.sh`` -with ``pip install -e .`` when your PWD is the root of the NeMo repository. +If you only want the toolkit without the additional Conda-based dependencies, you can replace ``reinstall.sh`` with ``pip install -e .`` when your PWD is the root of the NeMo repository. -Mac computers with Apple silicon -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -To install NeMo on Mac with Apple M-Series GPU: +Mac Computers with Apple Silicon +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -- create a new Conda environment +To install NeMo on Mac computers with the Apple M-Series GPU, you need to create a new Conda environment, install PyTorch 2.0 or higher, and then install the nemo_toolkit. -- install PyTorch 2.0 or higher +**Important: This method is only applicable to the ASR domain.** -- run the following code: +Run the following code: .. code-block:: shell @@ -285,7 +303,7 @@ To install NeMo on Mac with Apple M-Series GPU: conda install -c conda-forge pynini # install Cython manually - pip install cython + pip install cython packaging # clone the repo and install in development mode git clone https://github.com/NVIDIA/NeMo @@ -295,24 +313,22 @@ To install NeMo on Mac with Apple M-Series GPU: # Note that only the ASR toolkit is guaranteed to work on MacBook - so for MacBook use pip install 'nemo_toolkit[asr]' Windows Computers -~~~~~~~~~~~~~~~~~ +^^^^^^^^^^^^^^^^^ -One of the options is using Windows Subsystem for Linux (WSL). - -To install WSL: - -- In PowerShell, run the following code: +To install the Windows Subsystem for Linux (WSL), run the following code in PowerShell: .. code-block:: shell wsl --install # [note] If you run wsl --install and see the WSL help text, it means WSL is already installed. -Learn more about installing WSL at `Microsoft's official documentation `_. +To learn more about installing WSL, refer to `Microsoft's official documentation `_. 
+ +After installing your Linux distribution with WSL, two options are available: -After Installing your Linux distribution with WSL: - - **Option 1:** Open the distribution (Ubuntu by default) from the Start menu and follow the instructions. - - **Option 2:** Launch the Terminal application. Download it from `Microsoft's Windows Terminal page `_ if not installed. +**Option 1:** Open the distribution (Ubuntu by default) from the Start menu and follow the instructions. + +**Option 2:** Launch the Terminal application. Download it from `Microsoft's Windows Terminal page `_ if not installed. Next, follow the instructions for Linux systems, as provided above. For example: @@ -324,8 +340,11 @@ Next, follow the instructions for Linux systems, as provided above. For example: ./reinstall.sh RNNT -~~~~ -Note that RNNT requires numba to be installed from conda. +^^^^ + +For optimal performance of a Recurrent Neural Network Transducer (RNNT), install the Numba package from Conda. + +Run the following code: .. code-block:: bash @@ -333,14 +352,12 @@ Note that RNNT requires numba to be installed from conda. pip uninstall numba conda install -c conda-forge numba -LLM and Multimodal Dependencies -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Install LLMs and MMs Dependencies +--------------------------------- -The LLM and Multimodal domains require three additional dependencies: -NVIDIA Apex, NVIDIA Transformer Engine, and NVIDIA Megatron Core. +If you work with the LLM and MM domains, three additional dependencies are required: NVIDIA Apex, NVIDIA Transformer Engine, and NVIDIA Megatron Core. When working with the `main` branch, these dependencies may require a recent commit. -When working with the `main` branch these dependencies may require a recent commit. -The most recent working versions of these dependencies are: +The most recent working versions of these dependencies are here: .. code-block:: bash @@ -349,11 +366,14 @@ The most recent working versions of these dependencies are: export mcore_commit=fbb375d4b5e88ce52f5f7125053068caff47f93f export nv_pytorch_tag=24.02-py3 -When using a released version of NeMo, -please refer to the `Software Component Versions `_ -for the correct versions. +When using a released version of NeMo, please refer to the `Software Component Versions `_ for the correct versions. + +PyTorch Container +^^^^^^^^^^^^^^^^^ -If starting with a base NVIDIA PyTorch container first launch the container: +We recommended that you start with a base NVIDIA PyTorch container: nvcr.io/nvidia/pytorch:24.02-py3. + +If starting with a base NVIDIA PyTorch container, you must first launch the container: .. code-block:: bash @@ -366,15 +386,14 @@ If starting with a base NVIDIA PyTorch container first launch the container: --ulimit stack=67108864 \ nvcr.io/nvidia/pytorch:$nv_pytorch_tag -Then install the dependencies: +Next, you need to install the dependencies. Apex -~~~~ -NeMo LLM Multimodal Domains require that NVIDIA Apex to be installed. -Apex comes installed in the NVIDIA PyTorch container but it's possible that -NeMo LLM and Multimodal may need to be updated to a newer version. +^^^^ + +NVIDIA Apex is required for LLM and MM domains. Although Apex is pre-installed in the NVIDIA PyTorch container, you may need to update it to a newer version. -To install Apex, run +To install Apex, run the following code: .. code-block:: bash @@ -383,35 +402,32 @@ To install Apex, run git checkout $apex_commit pip install . 
-v --no-build-isolation --disable-pip-version-check --no-cache-dir --config-settings "--build-option=--cpp_ext --cuda_ext --fast_layer_norm --distributed_adam --deprecated_fused_adam --group_norm" +When attempting to install Apex separately from the NVIDIA PyTorch container, you might encounter an error if the CUDA version on your system is different from the one used to compile PyTorch. To bypass this error, you can comment out the relevant line in the setup file located in the Apex repository on GitHub here: https://github.com/NVIDIA/apex/blob/master/setup.py#L32. -While installing Apex outside of the NVIDIA PyTorch container, -it may raise an error if the CUDA version on your system does not match the CUDA version torch was compiled with. -This raise can be avoided by commenting it here: https://github.com/NVIDIA/apex/blob/master/setup.py#L32 +cuda-nvprof is needed to install Apex. The version should match the CUDA version that you are using. -cuda-nvprof is needed to install Apex. The version should match the CUDA version that you are using: +To install cuda-nvprof, run the following code: .. code-block:: bash conda install -c nvidia cuda-nvprof=11.8 -packaging is also needed: +Finally, install the packaging: .. code-block:: bash pip install packaging -With the latest versions of Apex, the `pyproject.toml` file in Apex may need to be deleted in order to install locally. - +To install the most recent versions of Apex locally, it might be necessary to remove the `pyproject.toml` file from the Apex directory. Transformer Engine -~~~~~~~~~~~~~~~~~~ +^^^^^^^^^^^^^^^^^^ -The NeMo LLM Multimodal Domains require that NVIDIA Transformer Engine to be installed. -Transformer Engine comes installed in the NVIDIA PyTorch container but it's possible that -NeMo LLM and Multimodal may need Transformer Engine to be updated to a newer version. +NVIDIA Transformer Engine is required for LLM and MM domains. Although the Transformer Engine is pre-installed in the NVIDIA PyTorch container, you may need to update it to a newer version. -Transformer Engine enables FP8 training on NVIDIA Hopper GPUs and many performance optimizations for transformer-based model training. -Documentation for installing Transformer Engine can be found `here `_. +The Transformer Engine facilitates training with FP8 precision on NVIDIA Hopper GPUs and introduces many enhancements for the training of Transformer-based models. Refer to `Transformer Enginer `_ for information. + +To install Transformer Engine, run the following code: .. code-block:: bash @@ -424,14 +440,11 @@ Documentation for installing Transformer Engine can be found `here `_. +-------------------- + +NeMo Text Processing, specifically Inverse Text Normalization, is now a separate repository. It is located here: `https://github.com/NVIDIA/NeMo-text-processing `_. + +Docker Containers +----------------- -Docker containers -~~~~~~~~~~~~~~~~~ -We release NeMo containers alongside NeMo releases. For example, NeMo ``r1.23.0`` comes with container ``nemo:24.01.speech``, you may find more details about released containers in `releases page `_. +NeMo containers are launched concurrently with NeMo version updates. NeMo Framework now supports LLMs, MMs, ASR, and TTS in a single consolidated Docker container. You can find additional information about released containers on the `NeMo releases page `_. -To use a pre-built container, please run +To use a pre-built container, run the following code: .. 
code-block:: bash - docker pull nvcr.io/nvidia/nemo:24.01.speech + docker pull nvcr.io/nvidia/nemo:24.05 -To build a nemo container with Dockerfile from a branch, please run +To build a nemo container with Dockerfile from a branch, run the following code: .. code-block:: bash - DOCKER_BUILDKIT=1 docker build -f Dockerfile -t nemo:latest . - + DOCKER_BUILDKIT=1 docker build -f Dockerfile -t nemo:latest If you choose to work with the main branch, we recommend using NVIDIA's PyTorch container version 23.10-py3 and then installing from GitHub. @@ -472,25 +485,32 @@ If you choose to work with the main branch, we recommend using NVIDIA's PyTorch -p 8888:8888 -p 6006:6006 --ulimit memlock=-1 --ulimit \ stack=67108864 --device=/dev/snd nvcr.io/nvidia/pytorch:23.10-py3 -Examples --------- -Many examples can be found under the `"Examples" `_ folder. +Future Work +----------- + +The NeMo Framework Launcher does not currently support ASR and TTS training, but it will soon. +Discussions Board +----------------- -Contributing ------------- +FAQ can be found on the NeMo `Discussions board `_. You are welcome to ask questions or start discussions on the board. + +Contribute to NeMo +------------------ We welcome community contributions! Please refer to `CONTRIBUTING.md `_ for the process. Publications ------------- +------------------ We provide an ever-growing list of `publications `_ that utilize the NeMo Framework. -If you would like to add your own article to the list, you are welcome to do so via a pull request to this repository's ``gh-pages-src`` branch. -Please refer to the instructions in the `README of that branch `_. +To contribute an article to the collection, please submit a pull request to the ``gh-pages-src`` branch of this repository. For detailed information, please consult the README located at the `gh-pages-src branch `_. + +Licenses +-------- + +* `NeMo GitHub Apache 2.0 license `__ -License -------- -NeMo is released under an `Apache 2.0 license `_. +* NeMo is licensed under the `NVIDIA AI PRODUCT AGREEMENT `__. By pulling and using the container, you accept the terms and conditions of this license. 
diff --git a/ci.groovy b/ci.groovy deleted file mode 100644 index 27ad659b99a1..000000000000 --- a/ci.groovy +++ /dev/null @@ -1,119 +0,0 @@ -@Library('blossom-github-lib@master') -import ipp.blossom.* - -podTemplate(cloud:'sc-ipp-blossom-prod', yaml : """ -apiVersion: v1 -kind: Pod -metadata: - labels: - some-label: some-label-value -spec: - volumes: - - name: scratch - nfs: - server: ipp1-cdot01-col01 - path: /vol/scratch1/scratch.okuchaiev_blossom - containers: - - name: latestdlfw - image: nvcr.io/nvidia/pytorch:23.02-py3 - command: - - cat - volumeMounts: - - name: scratch - mountPath: /testdata - resources: - limits: - nvidia.com/gpu: 2 - restartPolicy: Never - backoffLimit: 4 - tty: true - shm-size: 32g - nodeSelector: - kubernetes.io/os: linux - nvidia.com/gpu_type: "Tesla_T4x4" - nvidia.com/node_type: gpu_tester - nvidia.com/driver_version: "510.20" -""" -) { - node(POD_LABEL) { - def githubHelper - stage('Get Token') { - withCredentials([usernamePassword(credentialsId: 'GHAtoken', passwordVariable: 'GIT_PASSWORD', usernameVariable: 'GIT_USERNAME')]) { - // create new instance of helper object - githubHelper = GithubHelper.getInstance("${GIT_PASSWORD}", githubData) - } - - } - def stageName = '' - try { - currentBuild.description = githubHelper.getBuildDescription() - container('latestdlfw') { - stage('Code checkout') { - // update status on github - githubHelper.updateCommitStatus("$BUILD_URL", "$stageName Running", GitHubCommitState.PENDING) - checkout changelog: true, poll: true, scm: [$class: 'GitSCM', branches: [[name: "pr/"+githubHelper.getPRNumber()]], - doGenerateSubmoduleConfigurations: false, - submoduleCfg: [], - userRemoteConfigs: [[credentialsId: 'github-token', url: githubHelper.getCloneUrl(), refspec: '+refs/pull/*/head:refs/remotes/origin/pr/*']]] - } - - stage('Code Style') { - sh "apt-get update && \ - apt-get install -y bc && \ - nvidia-smi && \ - pip install -r requirements/requirements_test.txt && \ - python setup.py style && ls -l /testdata/TestData && ln -s /testdata/TestData /home/TestData && \ - ls -l /home && ls -l /home/TestData" - } - - stage('Installation') { - sh "git config --global --add safe.directory '*' && nvidia-smi && ./reinstall.sh release" - } - - stage('L0: GPU unit tests') { - sh "NEMO_NUMBA_MINVER=0.53 pytest -m 'not pleasefixme'" - } - - parallel( //USE CUDA_VISIBLE_DEVICES to execute 2 single GPU tests in parallel here - [ - "L1: NMT Training Pre-LN": { sh 'CUDA_VISIBLE_DEVICES=0 python examples/nlp/machine_translation/enc_dec_nmt.py \ - --config-path=conf \ - --config-name=aayn_base \ - do_testing=true \ - model.train_ds.src_file_name=/testdata/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.train_ds.tgt_file_name=/testdata/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - model.validation_ds.src_file_name=/testdata/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.validation_ds.tgt_file_name=/testdata/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.test_ds.src_file_name=/testdata/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.test_ds.tgt_file_name=/testdata/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.encoder_tokenizer.tokenizer_model=/testdata/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - model.decoder_tokenizer.tokenizer_model=/testdata/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - model.encoder.pre_ln=true \ - model.decoder.pre_ln=true \ - trainer.devices=[0] \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - +trainer.limit_test_batches=2 \ - exp_manager=null \ - '}, - "L1: 
Speech to text": { sh 'CUDA_VISIBLE_DEVICES=1 python examples/asr/asr_ctc/speech_to_text_ctc.py \ - model.train_ds.manifest_filepath=/testdata/TestData/an4_dataset/an4_train.json \ - model.validation_ds.manifest_filepath=/testdata/TestData/an4_dataset/an4_val.json \ - trainer.devices=[0] \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager=null \ - '} - ] - )//end of parallel - } - githubHelper.updateCommitStatus("$BUILD_URL", "Complete", GitHubCommitState.SUCCESS) - } - catch (Exception ex){ - currentBuild.result = 'FAILURE' - println ex - githubHelper.updateCommitStatus("$BUILD_URL", "$stageName Failed", GitHubCommitState.FAILURE) - } - - } - } \ No newline at end of file diff --git a/docs/source/core/core_index.rst b/docs/source/apis.rst similarity index 74% rename from docs/source/core/core_index.rst rename to docs/source/apis.rst index 01977c1b5101..e3c199bb47d5 100644 --- a/docs/source/core/core_index.rst +++ b/docs/source/apis.rst @@ -14,14 +14,26 @@ You can learn more about aspects of the NeMo "core" by following the links below :name: core :titlesonly: - core - neural_modules - exp_manager - neural_types - export - adapters/intro - api + core/core + core/neural_modules + core/exp_manager + core/neural_types + core/export + core/adapters/intro +You can learn more about aspects of the NeMo APIs by following the links below: + +.. toctree:: + :maxdepth: 1 + :name: API + :titlesonly: + + core/api + common/intro + nlp/api + multimodal/api + asr/api + tts/api Alternatively, you can jump straight to the documentation for the individual collections: diff --git a/docs/source/asr/api.rst b/docs/source/asr/api.rst index 2eb687d97d8e..c99d92c0371a 100644 --- a/docs/source/asr/api.rst +++ b/docs/source/asr/api.rst @@ -1,5 +1,5 @@ -NeMo ASR Collection API -======================= +NeMo ASR API +============ Model Classes diff --git a/docs/source/asr/datasets.rst b/docs/source/asr/datasets.rst index b4656eec3f3f..a6e9cbe96c63 100644 --- a/docs/source/asr/datasets.rst +++ b/docs/source/asr/datasets.rst @@ -261,11 +261,6 @@ Semi Sorted Batching Sorting samples by duration and spliting them into batches speeds up training, but can degrade the quality of the model. To avoid quality degradation and maintain some randomness in the partitioning process, we add pseudo noise to the sample length when sorting. - .. image:: images/ssb.png - :align: center - :alt: semi sorted batching - :scale: 50% - It may result into training speeedup of more than 40 percent with the same quality. To enable and use semi sorted batching add some lines in config. .. code:: @@ -772,30 +767,30 @@ To enable multimodal dataloading, we provide several configuration options: Example 3. Combine an ASR (audio-text) dataset with an MT (text-only) dataset so that mini-batches have some examples from both datasets. 
Provide a custom prompt field for both datasets (to be leveraged by a relevant dataset class): -```yaml -use_multimodal_sampling: true -batch_tokens: 1024 -token_equivalent_duration: 0.08 # 0.01 frame shift * 8 subsampling factor -quadratic_factor: 50 -num_buckets: 30 -use_bucketing: true -input_cfg: - - type: nemo_tarred - manifest_filepath: /path/to/manifest__OP_0..512_CL_.json - tarred_audio_filepath: /path/to/tarred_audio/audio__OP_0..512_CL_.tar - weight: 0.5 - tags: - lang: en - prompt: "Given the following recording, transcribe what the person is saying:" - - type: txt_pair - source_path: /path/to/en__OP_0..512_CL_.txt - target_path: /path/to/pl__OP_0..512_CL_.txt - source_language: en - target_language: pl - weight: 0.5 - tags: - prompt: "Translate the following text to Polish:" -``` +.. code-block:: yaml + + use_multimodal_sampling: true + batch_tokens: 1024 + token_equivalent_duration: 0.08 # 0.01 frame shift * 8 subsampling factor + quadratic_factor: 50 + num_buckets: 30 + use_bucketing: true + input_cfg: + - type: nemo_tarred + manifest_filepath: /path/to/manifest__OP_0..512_CL_.json + tarred_audio_filepath: /path/to/tarred_audio/audio__OP_0..512_CL_.tar + weight: 0.5 + tags: + lang: en + prompt: "Given the following recording, transcribe what the person is saying:" + - type: txt_pair + source_path: /path/to/en__OP_0..512_CL_.txt + target_path: /path/to/pl__OP_0..512_CL_.txt + source_language: en + target_language: pl + weight: 0.5 + tags: + prompt: "Translate the following text to Polish:" .. caution:: We strongly recommend to use multiple shards for text files as well so that different nodes and dataloading workers are able to randomize the order of text iteration. Otherwise, multi-GPU training has a high risk of duplication of text examples. diff --git a/docs/source/asr/intro.rst b/docs/source/asr/intro.rst index 7d1270af1267..d353b4d983dd 100644 --- a/docs/source/asr/intro.rst +++ b/docs/source/asr/intro.rst @@ -156,11 +156,11 @@ Canary-1B is a multi-lingual, multi-task model, supporting automatic speech-to-t .. raw:: html - diff --git a/docs/source/asr/models.rst b/docs/source/asr/models.rst index 97dafcb2bf6d..f002137beb0f 100644 --- a/docs/source/asr/models.rst +++ b/docs/source/asr/models.rst @@ -46,12 +46,14 @@ HuggingFace Spaces to try out Parakeet models in your browser: * `Parakeet-TDT-1.1B `__ space .. _Conformer_model: + Conformer --------- + .. _Conformer-CTC_model: + Conformer-CTC ~~~~~~~~~~~~~ -------------- Conformer-CTC is a CTC-based variant of the Conformer model introduced in :cite:`asr-models-gulati2020conformer`. Conformer-CTC has a similar encoder as the original Conformer but uses CTC loss and decoding instead of RNNT/Transducer loss, which makes it a non-autoregressive model. diff --git a/docs/source/asr/speech_intent_slot/api.rst b/docs/source/asr/speech_intent_slot/api.rst index 735c583f9115..d45f24f807f6 100644 --- a/docs/source/asr/speech_intent_slot/api.rst +++ b/docs/source/asr/speech_intent_slot/api.rst @@ -15,8 +15,10 @@ Mixins .. autoclass:: nemo.collections.asr.parts.mixins.ASRModuleMixin :show-inheritance: :members: + :no-index: .. autoclass:: nemo.collections.asr.parts.mixins.ASRBPEMixin :show-inheritance: :members: + :no-index: diff --git a/docs/source/asr/ssl/api.rst b/docs/source/asr/ssl/api.rst index 7103243a4b20..8e6f83986032 100644 --- a/docs/source/asr/ssl/api.rst +++ b/docs/source/asr/ssl/api.rst @@ -15,10 +15,12 @@ Mixins .. 
autoclass:: nemo.collections.asr.parts.mixins.mixins.ASRModuleMixin :show-inheritance: :members: + :no-index: .. autoclass:: nemo.core.classes.mixins.access_mixins.AccessMixin :show-inheritance: :members: + :no-index: diff --git a/docs/source/asr/ssl/intro.rst b/docs/source/asr/ssl/intro.rst index d1a7366164d8..76a3a75dcf37 100644 --- a/docs/source/asr/ssl/intro.rst +++ b/docs/source/asr/ssl/intro.rst @@ -1,5 +1,5 @@ -Self-Supervised Learning -================================= +Speech Self-Supervised Learning +=============================== Self-Supervised Learning (SSL) refers to the problem of learning without explicit labels. As any learning process require feedback, without explit labels, SSL derives supervisory signals from diff --git a/docs/source/ckpt_converters/dev_guide.rst b/docs/source/ckpt_converters/dev_guide.rst index 9faa752df2e1..601e69749b64 100644 --- a/docs/source/ckpt_converters/dev_guide.rst +++ b/docs/source/ckpt_converters/dev_guide.rst @@ -48,7 +48,7 @@ Script Placement and Naming Conventions Code Template ------------- -Below template tries to address the 11 steps in the guideline part. Please also use `Gemma Huggingface to NeMo converter `_ as an full example for development. +Below template tries to address the 11 steps in the guideline part. Please also use `Gemma Huggingface to NeMo converter `__ as an full example for development. .. code-block:: python @@ -210,7 +210,7 @@ A Simple Guide for Model Mapping and Conversion 2. **Common issues when converting: results not matching between Community model and NeMo model**: - a. Megatron Core uses a special QKV layout, which needs careful handling and reshaping from community models, especially when GQA or MQA is used. Refer to the `Gemma Huggingface to NeMo converter `_ for guidance. + a. Megatron Core uses a special QKV layout, which needs careful handling and reshaping from community models, especially when GQA or MQA is used. Refer to the `Gemma Huggingface to NeMo converter `__ for guidance. b. GLU Variants weights could also be a common source of error. In Megatron Core, the regular feedforward projection weights and gated forward weights are fused together, requiring careful attention to the order of these two. Refer to the `Gemma Huggingface to NeMo converter `_ for more details. 
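To make the fused-weight ordering issue in point 2b concrete, here is a simplified sketch of how the two halves of a gated feed-forward layer from a community checkpoint might be combined into a single fused projection. The tensor names (``gate_proj``, ``up_proj``) and the concatenation order are illustrative assumptions; always verify the layout expected by the converter and your Megatron Core version.

.. code-block:: python

    # Illustrative sketch only: fuse separate gate/up GLU projections into one matrix.
    # Swapping the two blocks produces a checkpoint that loads cleanly but gives
    # wrong outputs, which is why the order deserves careful attention.
    import torch

    hidden_size, ffn_hidden_size = 8, 32
    gate_proj = torch.randn(ffn_hidden_size, hidden_size)  # gating half of the GLU
    up_proj = torch.randn(ffn_hidden_size, hidden_size)    # regular feed-forward half

    fused_fc1 = torch.cat([gate_proj, up_proj], dim=0)
    assert fused_fc1.shape == (2 * ffn_hidden_size, hidden_size)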
diff --git a/docs/source/ckpt_converters/user_guide.rst b/docs/source/ckpt_converters/user_guide.rst index 9de22f4b5994..451679a7e3ae 100644 --- a/docs/source/ckpt_converters/user_guide.rst +++ b/docs/source/ckpt_converters/user_guide.rst @@ -6,45 +6,45 @@ This guide provides instructions on how to use the conversion scripts to convert Support Matrix -------------- -+----------------------+------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+ -| Conversion | From | To | Github Link | -+======================+==================+=====================+====================================================================================================================+ -| Baichuan | Hugging Face | NeMo | `Link `_ | -+----------------------+------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+ -| Baichuan | NeMo | Hugging Face | `Link `_ | -+----------------------+------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+ -| BERT | Hugging Face | NeMo | `Link `_ | -+----------------------+------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+ -| BERT | NeMo | Hugging Face | `Link `_ | -+----------------------+------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+ -| Falcon | Hugging Face | NeMo | `Link `_ | -+----------------------+------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+ -| Falcon | NeMo | Hugging Face | `Link `_ | -+----------------------+------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+ -| Gemma | Hugging Face | NeMo | `Link `_ | -+----------------------+------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+ -| Gemma | JAX | NeMo | `Link `_ | -+----------------------+------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+ -| Gemma | PyTorch | NeMo | `Link `_ | -+----------------------+------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+ -| GPT/LLaMA | NeMo (Legacy) | NeMo (Megatron-Core)| `Link `_ | -+----------------------+------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+ -| LLaMA | Hugging Face | NeMo | `Link `_ | -+----------------------+------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+ -| LLaMA | NeMo | Hugging Face | `Link `_ | -+----------------------+------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+ -| Mistral 7B | Hugging Face 
| NeMo | `Link `_ | -+----------------------+------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+ -| Mistral 7B | NeMo | Hugging Face | `Link `_ | -+----------------------+------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+ -| Mixtral | Hugging Face | NeMo | `Link `_ | -+----------------------+------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+ -| Mixtral | NeMo | Hugging Face | `Link `_ | -+----------------------+------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+ -| MPT | Hugging Face | NeMo | `Link `_ | -+----------------------+------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+ -| Starcoder | Hugging Face | NeMo | `Link `_ | -+----------------------+------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+ ++----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+ +| Conversion | From | To | Github Link | ++======================+==================+=====================+=====================================================================================================================+ +| Baichuan | Hugging Face | NeMo | `Link `__ | ++----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+ +| Baichuan | NeMo | Hugging Face | `Link `__ | ++----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+ +| BERT | Hugging Face | NeMo | `Link `__ | ++----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+ +| BERT | NeMo | Hugging Face | `Link `__ | ++----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+ +| Falcon | Hugging Face | NeMo | `Link `__ | ++----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+ +| Falcon | NeMo | Hugging Face | `Link `__ | ++----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+ +| Gemma | Hugging Face | NeMo | `Link `__ | ++----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+ +| Gemma | JAX | NeMo | `Link `__ | 
++----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+ +| Gemma | PyTorch | NeMo | `Link `__ | ++----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+ +| GPT/LLaMA | NeMo (Legacy) | NeMo (Megatron-Core)| `Link `__ | ++----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+ +| LLaMA | Hugging Face | NeMo | `Link `__ | ++----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+ +| LLaMA | NeMo | Hugging Face | `Link `__ | ++----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+ +| Mistral 7B | Hugging Face | NeMo | `Link `__ | ++----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+ +| Mistral 7B | NeMo | Hugging Face | `Link `__ | ++----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+ +| Mixtral | Hugging Face | NeMo | `Link `__ | ++----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+ +| Mixtral | NeMo | Hugging Face | `Link `__ | ++----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+ +| MPT | Hugging Face | NeMo | `Link `__ | ++----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+ +| Starcoder | Hugging Face | NeMo | `Link `__ | ++----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+ Convert Hugging Face LLaMA Checkpoints to NeMo @@ -54,7 +54,7 @@ To convert a Hugging Face LLaMA checkpoint into a NeMo checkpoint, use the follo .. code-block:: bash - python convert_llama_hf_to_nemo.py>`_ \ + python convert_llama_hf_to_nemo.py \ --input_name_or_path \ --output_path @@ -67,7 +67,7 @@ To convert a NeMo checkpoint into a Hugging Face LLaMA checkpoint, you have two .. code-block:: bash - python convert__nemo_to_hf.py>`_ \ + python convert__nemo_to_hf.py \ --input_name_or_path /path/to/file.nemo or /path/to/extracted_folder \ --output_path /path/to/pytorch_model.bin @@ -75,7 +75,7 @@ To convert a NeMo checkpoint into a Hugging Face LLaMA checkpoint, you have two .. 
code-block:: bash - python convert__nemo_to_hf.py>`_ \ + python convert__nemo_to_hf.py \ --input_name_or_path /path/to/file.nemo or /path/to/extracted_folder \ --output_path /path/to/model_folder \ --hf_input_path /path/to/input_hf_folder \ diff --git a/docs/source/collections.rst b/docs/source/collections.rst index 1cc7a654b9c1..d4bea503513b 100644 --- a/docs/source/collections.rst +++ b/docs/source/collections.rst @@ -11,26 +11,9 @@ Documentation for the individual collections :titlesonly: nlp/nemo_megatron/intro - nlp/models nlp/machine_translation/machine_translation nlp/megatron_onnx_export nlp/quantization - nlp/api - - -.. toctree:: - :maxdepth: 1 - :caption: Speech AI - :name: Speech AI - :titlesonly: - - asr/intro - asr/speech_classification/intro - asr/speaker_recognition/intro - asr/speaker_diarization/intro - asr/ssl/intro - asr/speech_intent_slot/intro - .. toctree:: :maxdepth: 1 @@ -42,29 +25,32 @@ Documentation for the individual collections multimodal/vlm/intro multimodal/text2img/intro multimodal/nerf/intro - multimodal/api - .. toctree:: :maxdepth: 1 - :caption: Text To Speech (TTS) - :name: Text To Speech + :caption: Vision (CV) + :name: vision :titlesonly: - tts/intro + vision/intro .. toctree:: :maxdepth: 1 - :caption: Vision (CV) - :name: vision + :caption: Speech AI + :name: Speech AI :titlesonly: - vision/intro + asr/intro + asr/speech_classification/intro + asr/speaker_recognition/intro + asr/speaker_diarization/intro + asr/ssl/intro + asr/speech_intent_slot/intro .. toctree:: :maxdepth: 1 - :caption: Common - :name: Common + :caption: Text To Speech (TTS) + :name: Text To Speech :titlesonly: - common/intro \ No newline at end of file + tts/intro diff --git a/docs/source/common/intro.rst b/docs/source/common/intro.rst index fadbd9528485..a89f1a480e5d 100644 --- a/docs/source/common/intro.rst +++ b/docs/source/common/intro.rst @@ -1,5 +1,5 @@ -Common Collection -================= +NeMo Common Collection API +========================== The common collection contains things that could be used across all collections. 
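As a small illustration of the kind of building block that lives in the common collection, the sketch below loads a shared tokenizer. The ``.model`` path is a placeholder for a trained SentencePiece model file; this is an example of usage, not part of this change.

.. code-block:: python

    # Sketch: tokenizers are one example of a utility shared across collections.
    from nemo.collections.common.tokenizers import SentencePieceTokenizer

    tokenizer = SentencePieceTokenizer(model_path="/path/to/tokenizer.model")
    print(tokenizer.text_to_ids("hello world"))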
diff --git a/docs/source/conf.py b/docs/source/conf.py index e8fba7457605..c599f630d7f7 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -113,10 +113,9 @@ "sphinx.ext.viewcode", "sphinx.ext.napoleon", "sphinx.ext.githubpages", - "sphinxcontrib.bibtex", "sphinx.ext.inheritance_diagram", "sphinx.ext.intersphinx", - "sphinx.ext.autosectionlabel", + # "sphinx.ext.autosectionlabel", "sphinxcontrib.bibtex", "sphinx_copybutton", "sphinxext.opengraph", diff --git a/docs/source/core/adapters/api.rst b/docs/source/core/adapters/api.rst index b0f2a8e13610..8922c72d63eb 100644 --- a/docs/source/core/adapters/api.rst +++ b/docs/source/core/adapters/api.rst @@ -9,6 +9,7 @@ Core :members: :member-order: bysource :undoc-members: adapter_module_names + :no-index: ----- @@ -17,6 +18,7 @@ Core :members: :member-order: bysource :undoc-members: adapter_module_names + :no-index: ----- @@ -28,6 +30,7 @@ Adapter Networks :show-inheritance: :members: :member-order: bysource + :no-index: ----- @@ -35,6 +38,7 @@ Adapter Networks :show-inheritance: :members: :member-order: bysource + :no-index: ----- @@ -47,6 +51,7 @@ Adapter Strategies :members: :member-order: bysource :undoc-members: adapter_module_names + :no-index: ----- @@ -55,6 +60,7 @@ Adapter Strategies :members: :member-order: bysource :undoc-members: adapter_module_names + :no-index: ----- @@ -63,3 +69,4 @@ Adapter Strategies :members: :member-order: bysource :undoc-members: adapter_module_names + :no-index: diff --git a/docs/source/core/adapters/components.rst b/docs/source/core/adapters/components.rst index cc2ea0b525df..d8bed1b23a75 100644 --- a/docs/source/core/adapters/components.rst +++ b/docs/source/core/adapters/components.rst @@ -8,7 +8,7 @@ An adapter module can be any pytorch module, but it must follow certain straight 1) The model accepts an input of some input dimension, and its output must match this dimension. 2) Ideally, the module is initialized such that the output of the adapter when initialized is such that it does not modify the original input. This allows the model to produce the same output results, even when additional parameters have been added. -According to Junxian et al :cite:`adapters-Junxian2021unified`, we can consider an adapter being represented as three components - +According to Junxian et al :cite:`adapters-components-Junxian2021unified`, we can consider an adapter being represented as three components - 1) Functional form - the trainable parameters that will modify the input 2) Insertion form - Where the adapter outputs are integrated with the original input. The input to the adapters can be the last output of the layer, the input to some attention layer, or even the original input to the module itself (before even the modules forward pass). @@ -17,7 +17,7 @@ According to Junxian et al :cite:`adapters-Junxian2021unified`, we can consider Functional Form - Adapter Networks ================================== -Adapter modules represent the functional form of the adapter. We discuss an example of a most commonly used adapter module found in literature, titled the ``LinearAdapter`` (or Houlsby Adapter) :cite:`adapters-houlsby2019adapter`. +Adapter modules represent the functional form of the adapter. We discuss an example of a most commonly used adapter module found in literature, titled the ``LinearAdapter`` (or Houlsby Adapter) :cite:`adapters-components-houlsby2019adapter`. .. note:: @@ -28,6 +28,7 @@ Adapter modules represent the functional form of the adapter. 
We discuss an exam :show-inheritance: :members: :member-order: bysource + :no-index: ----- @@ -35,12 +36,13 @@ Adapter modules represent the functional form of the adapter. We discuss an exam :show-inheritance: :members: :member-order: bysource + :no-index: Insertion Form - Module Adapters -------------------------------- -Adapter modules can be integrated into many different locations of a given module. For example, it is possible to have an adapter that affects only the outputs of the final layer in each module. We can also have a ``Parallel Adapter`` :cite:`adapters-Junxian2021unified` that operates at the input of the module itself, in parallel to the forward pass of the module. Yet another insertion location is inside the Multi Head Attention Layers. +Adapter modules can be integrated into many different locations of a given module. For example, it is possible to have an adapter that affects only the outputs of the final layer in each module. We can also have a ``Parallel Adapter`` :cite:`adapters-components-Junxian2021unified` that operates at the input of the module itself, in parallel to the forward pass of the module. Yet another insertion location is inside the Multi Head Attention Layers. On top of this, while adapters are commonly used only in the layers containing the most parameters (say the Encoder of a network), some models can support adapters in multiple locations (Encoder-Decoder architecture for Language Models, Machine Translation, or even Encoder-Decoder-Joint for ASR with Transducer Loss). As such, NeMo utilizes the concept of ``Module Adapters``. @@ -70,6 +72,7 @@ We discuss a simple residual additional connection strategy below - that accepts :members: :member-order: bysource :undoc-members: adapter_module_names + :no-index: ----- @@ -78,6 +81,7 @@ We discuss a simple residual additional connection strategy below - that accepts :members: :member-order: bysource :undoc-members: adapter_module_names + :no-index: ----- @@ -87,4 +91,4 @@ References .. bibliography:: ./adapter_bib.bib :style: plain - :keyprefix: adapters- + :keyprefix: adapters-components- diff --git a/docs/source/core/adapters/intro.rst b/docs/source/core/adapters/intro.rst index fd94c8d23446..8c5e9cbc8895 100644 --- a/docs/source/core/adapters/intro.rst +++ b/docs/source/core/adapters/intro.rst @@ -144,4 +144,5 @@ References .. bibliography:: ./adapter_bib.bib :style: plain + :labelprefix: adapters :keyprefix: adapters- diff --git a/docs/source/core/api.rst b/docs/source/core/api.rst index 6b389ca3be85..1aceb73de0d9 100644 --- a/docs/source/core/api.rst +++ b/docs/source/core/api.rst @@ -1,6 +1,6 @@ -Core APIs -========= +NeMo Core APIs +============== Base class for all NeMo models ------------------------------ diff --git a/docs/source/core/core.rst b/docs/source/core/core.rst index 6e5efa56d5f0..1c9325cf0a96 100644 --- a/docs/source/core/core.rst +++ b/docs/source/core/core.rst @@ -16,9 +16,10 @@ NeMo models contain everything needed to train and reproduce Conversational AI m NeMo uses `Hydra `_ for configuring both NeMo models and the PyTorch Lightning Trainer. -.. note:: Every NeMo model has an example configuration file and training script that can be found `here `_. +.. note:: + Every NeMo model has an example configuration file and training script that can be found `here `__. -The end result of using NeMo, `Pytorch Lightning `_, and Hydra is that NeMo models all have the same look and feel and are also fully compatible with the PyTorch ecosystem. 
+The end result of using NeMo, `Pytorch Lightning `__, and Hydra is that NeMo models all have the same look and feel and are also fully compatible with the PyTorch ecosystem. Pretrained ---------- @@ -42,14 +43,14 @@ To see all available pretrained models for a specific NeMo model, use the ``list For detailed information on the available pretrained models, refer to the collections documentation: -- :ref:`Automatic Speech Recognition (ASR)` +- :doc:`Automatic Speech Recognition (ASR) <../asr/intro>` - :doc:`Natural Language Processing (NLP) <../nlp/models>` - :doc:`Text-to-Speech Synthesis (TTS) <../tts/intro>` Training -------- -NeMo leverages `PyTorch Lightning `_ for model training. PyTorch Lightning lets NeMo decouple the +NeMo leverages `PyTorch Lightning `__ for model training. PyTorch Lightning lets NeMo decouple the conversational AI code from the PyTorch training code. This means that NeMo users can focus on their domain (ASR, NLP, TTS) and build complex AI applications without having to rewrite boiler plate code for PyTorch training. @@ -298,7 +299,7 @@ With NeMo and Hydra, every aspect of model training can be modified from the com of experiments on compute clusters or for quickly testing parameters while developing. All NeMo `examples `_ come with instructions on how to -run the training/inference script from the command-line (see `here `_ +run the training/inference script from the command-line (see `here `__ for an example). With Hydra, arguments are set using the ``=`` operator: diff --git a/docs/source/core/exp_manager.rst b/docs/source/core/exp_manager.rst index b44d27c38b4b..efb55b0feabb 100644 --- a/docs/source/core/exp_manager.rst +++ b/docs/source/core/exp_manager.rst @@ -379,3 +379,4 @@ ExpManagerConfig :show-inheritance: :members: :member-order: bysource + :no-index: diff --git a/docs/source/core/export.rst b/docs/source/core/export.rst index 990769452a5c..c53dd8159a60 100644 --- a/docs/source/core/export.rst +++ b/docs/source/core/export.rst @@ -194,7 +194,7 @@ To facilitate that, the hooks below are provided. To export, for example, 'encod First goes the one receiving input (input_example) """ -Some nertworks may be exported differently according to user-settable options (like ragged batch support for TTS or cache support for ASR). To facilitate that - `set_export_config()` method is provided by Exportable to set key/value pairs to predefined model.export_config dictionary, to be used during the export: +Some networks may be exported differently according to user-settable options (like ragged batch support for TTS or cache support for ASR). To facilitate that - `set_export_config()` method is provided by Exportable to set key/value pairs to predefined model.export_config dictionary, to be used during the export: .. code-block:: Python @@ -202,6 +202,7 @@ Some nertworks may be exported differently according to user-settable options (l """ Sets/updates export_config dictionary """ + Also, if an action hook on setting config is desired, this method may be overloaded by `Exportable` descendants to include one. An example can be found in ``/nemo/collections/asr/models/rnnt_models.py``. diff --git a/docs/source/core/neural_types.rst b/docs/source/core/neural_types.rst index 9003b9ca5203..ec7d94336c05 100644 --- a/docs/source/core/neural_types.rst +++ b/docs/source/core/neural_types.rst @@ -24,6 +24,7 @@ Types are implemented in ``nemo.core.neural_types.NeuralType`` class. When you i are expected to include both *axes* information and *element type* information. .. 
autoclass:: nemo.core.neural_types.NeuralType + :no-index: Type Comparison Results ----------------------- @@ -31,6 +32,7 @@ Type Comparison Results When comparing two neural types, the following comparison results are generated. .. autoclass:: nemo.core.neural_types.NeuralTypeComparisonResult + :no-index: Examples -------- @@ -113,6 +115,7 @@ Custom element types It is possible to create user-defined element types to express the semantics of elements in your tensors. To do so, the user will need to inherit and implement abstract methods of the ``nemo.core.neural_types.elements.ElementType`` class .. autoclass:: nemo.core.neural_types.elements.ElementType + :no-index: Note that element types can be parametrized. Consider this example where it distinguishes between audio sampled at 8Khz and 16Khz. diff --git a/docs/source/features/memory_optimizations.rst b/docs/source/features/memory_optimizations.rst index 0e0b3ad84402..4d363670fedf 100644 --- a/docs/source/features/memory_optimizations.rst +++ b/docs/source/features/memory_optimizations.rst @@ -3,7 +3,7 @@ Memory Optimizations Parallelism ----------- -Refer to :doc:`Parallelism <./parallelism>`. +Refer to :doc:`Parallelism <./parallelisms>`. Flash Attention --------------- @@ -11,38 +11,97 @@ Flash Attention Overview ^^^^^^^^ -Flash Attention is a method designed to enhance the efficiency of Transformer models, which are widely utilized in applications such as Natural Language Processing (NLP). Traditional Transformers are slow and consume a lot of memory, especially with long sequences, due to the quadratic time and memory complexity of self-attention. FlashAttention, an IO-aware exact attention algorithm that leverages tiling to minimize the number of memory reads/writes between the GPU's high bandwidth memory (HBM) and on-chip SRAM. This approach is designed to be more efficient in terms of IO complexity compared to standard attention mechanisms. +Flash attention is an algorithm designed to improve the efficiency of the attention mechanism in transformer models such as GPT and BERT. The attention mechanism has quadratic time and memory complexity in sequence length and can present significant runtime and memory challenges for longer sequences. + +Compared to the standard, non-flash algorithm, flash attention applies two techniques to lower the memory requirement and improve compute efficiency. + +The tiling technique decomposes the inputs based on the shared memory size and calculates the softmax one tile at a time. Instead of working on the entire query, key, value tensors at once, it makes several passes at these tensors and then combines the results in a subsequent step. + +The recomputation technique stores the softmax normalization factors (linear in sequence length), instead of the softmax results (quadratic in sequence length), and uses these normalization factors to recompute the attention scores. This reduces the amount of data written to global memory, lowering both the memory requirement and the I/O traffic between global memory and shared memory. + +Flash attention lowers the memory footprint and computational complexity from quadratic to linear, greatly extending the range of sequence lengths allowed in large language models. + +The flash attention algorithm was first proposed `here `_. Two of its implementations are `flash-attention `_ by Tri Dao *et al.*, and `fused flash attention `_ by NVIDIA cuDNN.
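To make the tiling and recomputation ideas above concrete, below is a minimal NumPy sketch (a toy reference for illustration, not the fused CUDA kernels of the implementations linked above). It processes one key/value tile at a time and keeps only running softmax statistics, so the full score matrix is never materialized.

.. code-block:: python

    import numpy as np

    def tiled_attention(q, k, v, tile=16):
        """softmax(q @ k.T / sqrt(d)) @ v, computed one key/value tile at a time."""
        d = q.shape[-1]
        out = np.zeros_like(q)
        row_max = np.full(q.shape[0], -np.inf)   # running max of attention scores
        row_sum = np.zeros(q.shape[0])           # running softmax denominator
        for start in range(0, k.shape[0], tile):
            s = q @ k[start:start + tile].T / np.sqrt(d)   # scores for this tile only
            new_max = np.maximum(row_max, s.max(axis=-1))
            correction = np.exp(row_max - new_max)         # rescale earlier partial results
            p = np.exp(s - new_max[:, None])
            out = out * correction[:, None] + p @ v[start:start + tile]
            row_sum = row_sum * correction + p.sum(axis=-1)
            row_max = new_max
        return out / row_sum[:, None]

    # Reference that materializes the full score matrix, for comparison.
    rng = np.random.default_rng(0)
    q, k, v = rng.standard_normal((3, 64, 32))
    scores = q @ k.T / np.sqrt(32)
    ref = np.exp(scores - scores.max(-1, keepdims=True))
    ref = ref / ref.sum(-1, keepdims=True) @ v
    assert np.allclose(tiled_attention(q, k, v), ref)

The recomputation technique plays the same role in the backward pass: only the per-row statistics are kept, and the tile-local attention scores are recomputed when gradients are needed.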
Turn Flash Attention On and Off ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -In the NeMo Framework, Flash Attention is supported through the Transformer Engine with the inclusion of Flash Attention 2. By default, Flash Attention is enabled, but the Transformer Engine may switch to a different kernel if the tensor dimensions are not optimal for Flash Attention. Users can completely disable Flash Attention by setting the environment variable ``NVTE_FLASH_ATTN=0``. +In the NeMo framework, flash attention is supported through `Transformer Engine `_, including both of the implementations mentioned above. Transformer Engine selects the appropriate implementation based on input information such as sequence length, number of heads and head dimension. When both implementations are applicable, Transformer Engine prefers cuDNN flash attention on Hopper+ architectures and Tri Dao flash attention on Ampere architectures. + +To disable Tri Dao flash attention, set the environment variable ``NVTE_FLASH_ATTN=0``. To disable cuDNN flash attention, set ``NVTE_FUSED_ATTN=0``. -For more details on the supported Dot Attention backend, please refer to the Transformer Engine source code available at `Transformer Engine's Attention Mechanism `_. +For more details on the Dot Product Attention backends supported in Transformer Engine, please refer to the source code at `Transformer Engine's Attention Mechanism `_. -.. bibliography:: ./nlp_all.bib - :style: plain - :labelprefix: nlp-megatron - :keyprefix: nlp-megatron- +Activation Recomputation +------------------------ Overview ^^^^^^^^ Full Activation Recomputation """"""""""""""""""""""""""""" -This method recalculates all the intermediate activations during the backward pass of a model's training, instead of storing them during the forward pass. This technique maximizes memory efficiency at the cost of computational overhead, as each activation is recomputed when needed. +The full activation recomputation method recalculates all the intermediate activations during the backward pass of a model's training, instead of storing them during the forward pass. This technique maximizes memory efficiency at the cost of computational overhead, as each activation is recomputed when needed. Partial Activation Recomputation """""""""""""""""""""""""""""""" -This method recomputes only a subset of layers during the backward phase. It is a trade-off between the full recomputation and no recomputation, balancing memory savings with computational efficiency. +The partial activation recomputation method recomputes only a subset of layers during the backward phase. It is a trade-off between the full recomputation and no recomputation, balancing memory savings with computational efficiency. Selective Activation Recomputation """""""""""""""""""""""""""""""""" -This method reduces memory footprint of activations significantly via smart activation checkpointing. This approach involves selectively storing only crucial activations and recomputing the others as needed. It is particularly useful in large models to minimize memory usage while controlling the computational cost. +The selective activation recomputation method reduces memory footprint of activations significantly via smart activation checkpointing. This approach involves selectively storing only crucial activations and recomputing the others as needed. It is particularly useful in large models to minimize memory usage while controlling the computational cost. 
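As a rough sketch of how a recomputation mode is selected in practice, the snippet below builds a config override with OmegaConf. The key names (``activations_checkpoint_granularity``, ``activations_checkpoint_method``, ``activations_checkpoint_num_layers``) follow the NeMo Megatron GPT example config, but treat the exact names and values here as illustrative assumptions rather than a tuned recipe.

.. code-block:: python

    # Illustrative only: choosing an activation recomputation mode through
    # config overrides (key names assumed from the Megatron GPT example config).
    from omegaconf import OmegaConf

    overrides = OmegaConf.create(
        {
            "model": {
                # "selective" recomputes only the memory-heavy attention pieces;
                # "full" recomputes entire transformer layers.
                "activations_checkpoint_granularity": "selective",
                # Only used with "full" granularity: how many layers to recompute
                # and how they are grouped ("uniform" or "block").
                "activations_checkpoint_method": "uniform",
                "activations_checkpoint_num_layers": 1,
            }
        }
    )
    print(OmegaConf.to_yaml(overrides))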
+ +Refer to "Reducing Activation Recomputation in Large Transformer Models" for more details: https://arxiv.org/abs/2205.05198. + +Multi-query Attention (MQA) and Grouped-query Attention (GQA) +------------------------------------------------------------- + +**Multi-query Attention (MQA)** and **Grouped-query Attention (GQA)** are modifications of the traditional multihead attention mechanism in Transformer models. These methods improve the efficiency and effectiveness of attention mechanisms. + +Overview +^^^^^^^^ + +**Multi-query Attention (MQA)** + MQA treats all attention heads as a single group, reducing computational complexity and accelerating training times. It is beneficial when model scalability or limited computational resources are concerns. + +**Grouped-query Attention (GQA)** + GQA groups the heads into clusters, each processing a subset of queries independently. This method balances the detailed focus of traditional multihead attention with the broad approach of MQA, enhancing nuanced input data processing. + +These attention variants offer: + +- **Reduced computational load**: Both methods decrease computation, beneficial for large models. +- **Increased processing speed**: Simplifying attention leads to faster training and inference. +- **Flexibility and adaptability**: Adjustments can be made based on task needs or hardware constraints. + +Enable MQA and GQA +^^^^^^^^^^^^^^^^^^ + +To use MQA or GQA in the NeMo Framework, adjust the ``num_query_groups`` parameter in the model configuration: + +1. **For Multi-query Attention (MQA)**: + - Set ``num_query_groups`` to `1` to treat all attention heads as a single group. + + .. code-block:: yaml + + num_query_groups: 1 # Enables Multi-query Attention + +2. **For Grouped-query Attention (GQA)**: + - Set ``num_query_groups`` to a number that is a divisor of the total number of attention heads (more than one but less than the total heads). + + .. code-block:: yaml + + num_query_groups: # Enables Grouped-query Attention + + - For regular attention, set this parameter to `None` or match it with the number of heads. + + .. code-block:: yaml + + num_query_groups: null # Default setting for regular multihead attention + +Adjust the ``num_query_groups`` to explore different attention mechanisms and optimize your model's performance based on specific needs. + +Implement MQA or GQA +^^^^^^^^^^^^^^^^^^^^ -Refer to "Reducing Activation Recomputation in Large Transformer Models" for more details: https://arxiv.org/abs/2205.05198 +NeMo's support for GQA and MQA is enabled through the integration of Megatron Core's Attention mechanism. The underlying implementation details can be explored within the Attention class of Megatron Core, which provides the functional backbone for these advanced attention methods. To understand the specific modifications and implementations of MQA and GQA, refer to the source code in the Attention class: -.. 
bibliography:: ./nlp_all.bib - :style: plain - :labelprefix: nlp-megatron - :keyprefix: nlp-megatron- \ No newline at end of file +Check implementation details from Attention Class in Megatron Core Repo: https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/transformer/attention.py#L49 diff --git a/docs/source/features/mixed_precision.rst b/docs/source/features/mixed_precision.rst index d193752e5475..ba0dfb4e945b 100644 --- a/docs/source/features/mixed_precision.rst +++ b/docs/source/features/mixed_precision.rst @@ -4,3 +4,45 @@ Mixed Precision Training ------------------------ Mixed precision training significantly enhances computational efficiency by conducting operations in half-precision and fp8 formats, while selectively maintaining minimal data in single-precision to preserve critical information throughout key areas of the network. NeMo now supports FP16, BF16, and FP8 (via Transformer Engine) across most models. Further details will be provided shortly. + + +FP8 usage +========= + +Overview +^^^^^^^^ + +The NVIDIA H100 GPU introduced support for a new datatype, FP8 (8-bit floating point), enabling higher throughput of matrix multiplies and convolutions. NeMo uses the NVIDIA `TransformerEngine `_ (TE) in order to leverage speedups from FP8. The following table summarizes the FP8 related arguments that can be configured in NeMo (`example config setting `_). For a more detailed overview, refer to the TE `documentation `_, specifically the FP8 `format `_ and `recipe `_. + +.. list-table:: FP8 arguments + :widths: 25 75 + :header-rows: 1 + + * - Argument + - Description + * - transformer_engine + - TE and related functionality can be enabled by setting this boolean argument to True. If this argument is not set to True, all subsequent arguments will be ignored. + * - fp8 + - Enables FP8 training. For transformer networks, the QKV, projection, FC1, and FC2 matrix multiplications are executed using the 4th generation H100 tensor cores with FP8 support. + * - fp8_e4m3 + - Training recipe format for FP8. Activations, weights, and gradient tensors use the E4M3 format. + * - fp8_hybrid + - Training recipe format for FP8. Activation and weight tensors use the E4M3 format, whereas gradient tensors use the E5M2 format to satisfy the additional dynamic range requirement for backward tensors. This is the default setting. + * - fp8_margin + - The scaling factor for FP8 tensors can be shifted by a factor of $2 ^ {margin}$ using this argument. + * - fp8_amax_history_len + - Window size for amax history. The window size determines how many instances of the most recent absolute max values (amaxes) are stored per tensor. + * - fp8_amax_compute_algo + - The choice between “max” and “most_recent” specifies how to select an amax value from the given history. + * - reduce_amax + - Indicates whether or not to perform an allreduce on the amax (absolute max) values for the FP8 tensors. Since the amax is directly used to compute the scaling factor for FP8 tensors, setting this argument ensures that the scaling factors for a tensor remain synchronized across devices in multi-GPU training configurations. + * - fp8_params + - Indicates whether or not to store module level parameters in FP8. Enabling this option can lead to reduced memory consumption. It eliminates the need to store a copy of weights in higher precision (> half) for cases where these weights are externally maintained, such as master parameters in the optimizer. For more information, refer to the `fp8_model_init `_ API in TE.
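To tie these arguments together, here is a hedged sketch of an FP8 override assembled with OmegaConf. The flag names mirror the table above; the nesting under ``model`` and the specific values are assumptions made for illustration only.

.. code-block:: python

    # A sketch, not an official recipe: combining the FP8 flags from the table
    # above into one override (nesting under "model" is an assumption).
    from omegaconf import OmegaConf

    fp8_overrides = OmegaConf.create(
        {
            "model": {
                "transformer_engine": True,    # required for the flags below to take effect
                "fp8": True,                   # run QKV/projection/FC1/FC2 GEMMs in FP8
                "fp8_hybrid": True,            # E4M3 activations/weights, E5M2 gradients
                "fp8_margin": 0,
                "fp8_amax_history_len": 1024,  # example window size, tune per workload
                "fp8_amax_compute_algo": "max",
                "reduce_amax": True,
            }
        }
    )
    print(OmegaConf.to_yaml(fp8_overrides))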
+ +Resources +^^^^^^^^^ + +- `TE documentation `_ +- `Intro to FP8, floating point formats, and mixed precision training `_ +- `Performance optimizations `_ that are natively supported in NeMo by enabling FP8 training with TE +- `TE installation `_ diff --git a/docs/source/features/parallelisms.rst b/docs/source/features/parallelisms.rst index b10477e4232c..4cc493f40024 100644 --- a/docs/source/features/parallelisms.rst +++ b/docs/source/features/parallelisms.rst @@ -3,59 +3,246 @@ Parallelisms ------------ -NeMo Megatron supports 5 types of parallelisms (which can be mixed together arbitrarily): +NeMo Megatron supports five types of parallelism (which can be mixed together arbitrarily). + +Data Parallelism +^^^^^^^^^^^^^^^^ + +Data Parallelism (DP) creates identical copies of the model across +multiple GPUs. Data batches are distributed between GPUs so that the +GPUs can process them independently. While compute is efficiently +distributed between GPUs, communication is required in order to keep +the model copies consistent with each other. Distributed Data Parallelism -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Distributed Data Parallelism (DDP) creates idential copies of the model across multiple GPUs. +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Distributed Data Parallelism (DDP) keeps model copies consistent by +synchronizing parameter gradients before each optimization step. More +specifically, it sums gradients over all model copies using an +all-reduce communication collective. .. image:: ../nlp/nemo_megatron/images/ddp.gif :align: center :width: 800px :alt: Distributed Data Parallel +Distributed Optimizer (ZeRO-1) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The ZeRO-1 algorithm keeps model copies consistent by sharding the +optimizer state between GPUs. During each optimization step, the +parameter gradients are first summed and sharded (with a +reduce-scatter collective), each GPU applies an optimization to its +local shard of the parameters, and the updated parameter shards are +broadcast to update all of the model copies (with an all-gather +collective). This approach is attractive for large models since +sharding the optimizer state can significantly reduce its memory +footprint on individual GPUs. It also has, in theory, the same +communication volume as DDP and its communication pattern has more +opportunities for overlapping with compute. + +Enable Data Parallelism +~~~~~~~~~~~~~~~~~~~~~~~ + +DDP is the default parallelism scheme when NeMo is run on multiple +GPUs. Enabling other parallelism schemes in the model configuration +will decrease the size of the DP group, that is the number of +identical model copies. + +To enable the distributed optimizer, set +``model.optim.name=distributed_fused_adam`` in the model +configuration. It can be configured with the following options: + +=========================== ========= ================================================================================================================================== +Option Default Description +=========================== ========= ================================================================================================================================== +``dtype`` fp32 Optimizer state datatype +``grad_sync_dtype`` ``dtype`` Gradient reduce-scatter datatype +``overlap_grad_sync`` True Overlap gradient reduce-scatter with compute +``overlap_param_sync`` False Overlap parameter all-gather with compute +``bucket_cap_mb`` 100 Buffer size (in MiB) for internal state and workspaces. 
Larger buckets have lower runtime overheads but may increase memory usage. +``contiguous_param_buffer`` False Allocate parameters as views into a large buffer. Helps avoid some data copies. +``contiguous_grad_buffer`` True Allocate parameter gradients as views into a large buffer. Helps avoid some data copies. +=========================== ========= ================================================================================================================================== + +See the keyword arguments in `Apex DistributedFusedAdam `_ and `NeMo MegatronDistributedFusedAdam `_ for a full list of distributed optimizer options. + +Implement Data Parallelism +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +DDP in NeMo either uses PyTorch +`DistributedDataParallel `_ +(default) or a custom implementation (if custom multi-precision +training is enabled with ``megatron_amp_O2``). + +The distributed optimizer in NeMo is built on top of +`DistributedFusedAdam `_ +from Apex. Tensor Parallelism ^^^^^^^^^^^^^^^^^^ -With Tensor Paralellism (TP) a tensor is split into non-overlapping pieces and -different parts are distributed and processed on separate GPUs. + +Tensor Parallelism (TP) is a method for distributing a model's computation across multiple GPUs by splitting tensors into non-overlapping pieces. This allows different parts of the tensor to be processed simultaneously on separate GPUs, enhancing performance and enabling the training of larger models. .. image:: ../nlp/nemo_megatron/images/tp.gif :align: center :width: 800px :alt: Tensor Parallel +Enable Tensor Parallelism +~~~~~~~~~~~~~~~~~~~~~~~~~ + +To enable TP in the NeMo framework, configure the ``tensor_model_parallel_size`` parameter in the model configuration. This parameter determines the number of GPUs among which the model's tensors are partitioned. + +**For Tensor Parallelism**: + +Set ``tensor_model_parallel_size`` to greater than ``1`` to enable intra-layer model parallelism. + + .. code-block:: yaml + + tensor_model_parallel_size: 1 # Example to enable Tensor Parallelism + +The configuration file can be adjusted here: `NeMo Megatron GPT Config `_. + +Implement Tensor Parallelism +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +NeMo integrates Tensor Parallelism through the implementation from Megatron Core. To understand how TP is activated within transformer blocks, refer to the code in the following repository: `Megatron-LM Transformer Block `_. + +For detailed API usage and additional configurations, consult the `Megatron Core Developer Guide `_. + Pipeline Parallelism ^^^^^^^^^^^^^^^^^^^^ -With Pipeline Paralellism (PP) consecutive layer chunks are assigned to different GPUs. + +Pipeline Parallelism (PP) is a technique that assigns consecutive layers or segments of a neural network to different GPUs. This division allows each GPU to process different stages of the network sequentially. .. image:: ../nlp/nemo_megatron/images/pp.gif :align: center :width: 800px :alt: Pipeline Parallel + +Enable Pipeline Parallelism +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +To utilize PP in the NeMo framework, you need to set the ``pipeline_model_parallel_size`` parameter in the model's configuration. This parameter specifies the number of GPUs among which the model's layers are distributed. + +**For Pipeline Parallelism**: + +Set ``pipeline_model_parallel_size`` to a value greater than ``1`` to enable inter-layer model parallelism. + + .. 
code-block:: yaml + + pipeline_model_parallel_size: 1 # Example to enable Pipeline Parallelism + +Adjust the configuration accordingly here: `NeMo Megatron GPT Config `_. + +Interleaved Pipeline Parallel Schedule +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +To minimize the pipeline bubble, the computation on each GPU can be divided into multiple subsets of layers (referred to as model chunks), rather than a single contiguous block. For instance, instead of each GPU processing a continuous set of four layers, it might handle two model chunks with two layers each. + + .. code-block:: yaml + + virtual_pipeline_model_parallel_size: 2 # Set for interleaved pipeline + +For more insights into this approach, see our detailed blog: `Scaling Language Model Training `_. + +Implement Pipeline Parallelism +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The NeMo implementation of PP leverages functionalities from Megatron Core. For a practical example of how PP is implemented within transformer blocks in NeMo, you can inspect the following codebase: `Megatron-LM Transformer Block `_. + +For more detailed API usage and configurations related to PP, visit the `Megatron Core Developer Guide `_. + Sequence Parallelism ^^^^^^^^^^^^^^^^^^^^ +Sequence Parallelism extends tensor-level model parallelism by distributing computing load and activation memory across multiple GPUs along the sequence dimension of transformer layers. This method is particularly useful for portions of the layer that have previously not been parallelized, enhancing overall model performance and efficiency. + .. image:: ../nlp/nemo_megatron/images/sp.gif :align: center :width: 800px :alt: Sequence Parallel +Enable Sequence Parallelism +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +To utilize Sequence Parallelism in NeMo, set the ``sequence_parallel`` parameter to ``True`` in the model's configuration. Note that this feature is effective only when the tensor parallel size (``tensor_model_parallel_size``) is greater than ``1``. + + .. code-block:: yaml + + sequence_parallel: True # Enable Sequence Parallelism + +For further information on configuration, refer to the following documentation: `NeMo Megatron GPT Config `_. + +Implement Sequence Parallelism +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The NeMo implementation of Sequence Parallelism utilizes functionality from Megatron Core. For an in-depth look at how Sequence Parallelism is integrated into the Megatron Core architecture, you can examine the source code here: `Megatron-LM Sequence Parallel Source Code `_. + +Context Parallelism +^^^^^^^^^^^^^^^^^^^ + +Context Parallelism (CP) is a method for parallelizing the processing of neural network activations across multiple GPUs, focusing on the sequence dimension of the input data. Unlike Sequence Parallelism (SP) that only partitions specific types of activations, CP divides all network activations along the sequence dimension. + +Enable Context Parallelism +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +To activate CP in the NeMo framework, set the ``context_parallel_size`` parameter in the model configuration. This parameter specifies the number of GPUs among which the model's sequence activations are distributed. + +**For Context Parallelism**: + +Set ``context_parallel_size`` to a value greater than ``1`` to enable sequence-wide model parallelism. + + .. code-block:: yaml + + context_parallel_size: 1 # Example to enable Context Parallelism + +The configuration can be found and modified here: `NeMo Megatron Core Context Config `_. 
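The parallelism sizes described in this section compose multiplicatively: one model replica spans the product of the tensor, pipeline, and context parallel sizes, and the remaining factor of the world size becomes the data-parallel size. The small helper below is hypothetical (not a NeMo API) and simply makes that bookkeeping explicit.

.. code-block:: python

    # Hypothetical helper, not part of NeMo: sanity-check how the parallelism
    # sizes described above combine into the available number of GPUs.
    def data_parallel_size(world_size, tp=1, pp=1, cp=1):
        per_replica = tp * pp * cp          # GPUs spanned by one model replica
        if world_size % per_replica:
            raise ValueError("world size must be divisible by tp * pp * cp")
        return world_size // per_replica

    # e.g. 32 GPUs with tensor_model_parallel_size=4, pipeline_model_parallel_size=2,
    # and context_parallel_size=2 leave 2 data-parallel model replicas.
    assert data_parallel_size(32, tp=4, pp=2, cp=2) == 2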
+ +Implement Context Parallelism +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +NeMo leverages functionalities from both Megatron Core and Transformer Engine to implement CP efficiently. During forward propagation, each GPU handles a segment of the sequence, storing only the necessary Key and Value (KV) pairs. In the backward pass, these KV pairs are reassembled across GPUs using advanced communication schemes like all-gather and reduce-scatter transformed into point-to-point communications in a ring topology. This method reduces the memory footprint significantly while maintaining computational efficiency. + +Visit our source code for more insights into the implementation: +- `Megatron Core wrappers for Transformer Engine `_ +- `Transformer Engine attention modules `_ + + Expert Parallelism ^^^^^^^^^^^^^^^^^^ -Expert Paralellim (EP) distributes experts across GPUs. - +Expert Parallelism (EP) is a type of model parallelism that distributes experts of an MoE across GPUs. .. image:: ../nlp/nemo_megatron/images/ep.png :align: center :width: 800px :alt: Expert Parallelism +Enable Expert Parallelism +~~~~~~~~~~~~~~~~~~~~~~~~~ + +To enable EP, set ``model.expert_model_parallel_size`` to the desired expert parallel size. For example, if the model has six experts (``model.num_moe_experts=6``), then setting ``model.expert_model_parallel_size=3`` results in each GPU processing two experts. The number of experts should be divisible by the expert parallel size. + + .. code-block:: yaml + + expert_model_parallel_size: 3 # Set EP to 3 + +For further information on configuration, refer to the following documentation: `NeMo Megatron GPT Config `_. + + +Implement Expert Parallelism +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The NeMo implementation of Expert Parallelism uses functionality from Megatron Core. Please consult the `Megatron Core MoE layer `_ for more MoE implementation details. + + Parallelism nomenclature ^^^^^^^^^^^^^^^^^^^^^^^^ -When reading and modifying NeMo Megatron code you will encounter the following terms. +The following figure illustrates some terms that you may encounter in the NeMo Megatron codebase. .. image:: ../nlp/nemo_megatron/images/pnom.gif :align: center diff --git a/docs/source/features/throughput_optimizations.rst b/docs/source/features/throughput_optimizations.rst index 825c3add5dfb..dfd8b6cf9310 100644 --- a/docs/source/features/throughput_optimizations.rst +++ b/docs/source/features/throughput_optimizations.rst @@ -71,8 +71,8 @@ target length (i.e. efficient packing), then use shuffle. Otherwise try *first_f python scripts/nlp_language_modeling/prepare_packed_ft_dataset.py \ model.data.train_ds.file_names=[/path/to/training.jsonl] \ model.data.train_ds.max_seq_length=2048 \ - model.restore_from_path= \ - +output_dir= + +tokenizer_path=/path/to/tokenizer.model \ + +output_dir=/path/to/output_folder \ +pack_sizes=[2048,4096,8192] \ [ +packing_algorithm=first_fit_shuffle \ ] [ +seed=0 ] @@ -86,10 +86,7 @@ target length (i.e. efficient packing), then use shuffle. Otherwise try *first_f to the size of packed sequence (``pack_size``). ``max_seq_length`` should be set to the same value as unpacked data, and can be determined by examining the distribution of sequence lengths in the dataset. - Note 3. Currently, we require a full nemo model file for simplicity and readability of code, but in theory only a - tokenizer file is needed. This part can be improved in a future iteration of the script. - - Note 4. ``pack_sizes`` is a list of packed sequence lengths. 
In this example, there will be three output files, one for + Note 3. ``pack_sizes`` is a list of packed sequence lengths. In this example, there will be three output files, one for each pack size. The output files are named ``/packed_{pack_size}_seed{seed}.npy``. This argument is a list because you will likely want to experiment with a few ``pack_sizes`` to find out which length can fill the GPU memory without exceeding it. Adjusting ``pack_size`` is analogous to adjusting the micro batch size in @@ -135,6 +132,14 @@ To train with packed sequences, you need to change four items in the SFT/PEFT co Now you are all set to finetune your model with a much improved throughput! +Sequence Packing for NeVA +------------------------- + +Sequence packing in NeVA (Multimodal LLMs) differs slightly from the LLM SFT/PEFT approach. For details, +please refer to the documentation below + +:doc:`../multimodal/mllm/sequence_packing` + Communication Overlap --------------------- NeMo leverages Megatron-Core's optimizations to enhance bandwidth utilization and effectively overlap computation with communication. Additional details will be provided soon. diff --git a/docs/source/index.rst b/docs/source/index.rst index 82d3359480ca..511d3ef700c9 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -41,7 +41,6 @@ For quick guides and tutorials, see the "Getting started" section below. :titlesonly: starthere/intro - starthere/best-practices starthere/tutorials For more information, browse the developer docs for your area of interest in the contents section below or on the left sidebar. @@ -70,7 +69,7 @@ For more information, browse the developer docs for your area of interest in the :name: APIs :titlesonly: - core/core_index + apis .. toctree:: :maxdepth: 1 @@ -86,4 +85,4 @@ For more information, browse the developer docs for your area of interest in the :name: Speech AI Tools :titlesonly: - tools/intro \ No newline at end of file + tools/intro diff --git a/docs/source/multimodal/api.rst b/docs/source/multimodal/api.rst index d6f96e6c6ea4..7a9fe2822d07 100644 --- a/docs/source/multimodal/api.rst +++ b/docs/source/multimodal/api.rst @@ -1,5 +1,5 @@ -Multimodal API -======================= +NeMo Multimodal API +=================== Model Classes ------------- @@ -8,6 +8,7 @@ Model Classes :show-inheritance: :no-members: :members: __init__, configure_optimizers + :no-index: .. autoclass:: nemo.collections.multimodal.models.text_to_image.stable_diffusion.ldm.ddpm.MegatronLatentDiffusion @@ -16,18 +17,18 @@ Model Classes :members: __init__, training_step, validation_step, setup, build_train_valid_test_datasets -.. autoclass:: nemo.collections.multimodal.models.dreambooth.dreambooth.MegatronDreamBooth +.. autoclass:: nemo.collections.multimodal.models.text_to_image.dreambooth.dreambooth.MegatronDreamBooth :show-inheritance: :no-members: :members: __init__, training_step, validation_step, setup, build_train_valid_test_datasets -.. autoclass:: nemo.collections.multimodal.models.controlnet.controlnet.MegatronControlNet +.. autoclass:: nemo.collections.multimodal.models.text_to_image.controlnet.controlnet.MegatronControlNet :show-inheritance: :no-members: :members: __init__, training_step, validation_step, setup, build_train_valid_test_datasets -.. autoclass:: nemo.collections.multimodal.models.imagen.imagen.MegatronImagen +.. 
autoclass:: nemo.collections.multimodal.models.text_to_image.imagen.imagen.MegatronImagen :show-inheritance: :no-members: :members: __init__, training_step, validation_step, setup, build_train_valid_test_datasets @@ -65,7 +66,7 @@ Modules :members: __init__, encode -.. autoclass:: nemo.collections.multimodal.models.controlnet.controlnet.ControlledUnetModel +.. autoclass:: nemo.collections.multimodal.models.text_to_image.controlnet.controlnet.ControlledUnetModel :show-inheritance: :no-members: :members: forward diff --git a/docs/source/multimodal/mllm/checkpoint.rst b/docs/source/multimodal/mllm/checkpoint.rst index 46c6da631ba2..d1fe7b651e66 100644 --- a/docs/source/multimodal/mllm/checkpoint.rst +++ b/docs/source/multimodal/mllm/checkpoint.rst @@ -41,7 +41,7 @@ Converting Local Checkpoints The training script only auto-converts the final checkpoint into the ``.nemo`` format. To evaluate intermediate training checkpoints, conversion to ``.nemo`` might be needed. For this: -.. code-block:: python +.. code-block:: bash python -m torch.distributed.launch --nproc_per_node= * \ examples/multimodal/convert_ckpt_to_nemo.py \ @@ -59,12 +59,12 @@ NeVA Checkpoints Currently, the conversion mainly supports LLaVA checkpoints based on "llama-2 chat" checkpoints. As a reference, we'll consider the checkpoint `llava-llama-2-13b-chat-lightning-preview `_. -After downloading this checkpoint and saving it at `/path/to/llava-llama-2-13b-chat-lightning-preview`, undertake the following procedures: +After downloading this checkpoint and saving it at ``/path/to/llava-llama-2-13b-chat-lightning-preview``, undertake the following procedures: Modifying the Tokenizer """"""""""""""""""""""" -NeMo mandates adding specific tokens to the tokenizer model for peak performance. To modify an existing tokenizer located in `/path/to/llava-llama-2-13b-chat-lightning-preview/tokenizer`, execute the following in the NeMo container: +NeMo mandates adding specific tokens to the tokenizer model for peak performance. To modify an existing tokenizer located in ``/path/to/llava-llama-2-13b-chat-lightning-preview/tokenizer``, execute the following in the NeMo container: .. code-block:: bash @@ -82,7 +82,7 @@ Checkpoint Conversion For conversion: -.. code-block:: python +.. code-block:: bash python examples/multimodal/mllm/neva/convert_hf_llava_to_neva.py \ --in-file /path/to/llava-llama-2-13b-chat-lightning-preview \ @@ -99,7 +99,7 @@ NeVA Checkpoints Adjust model parallelism with: -.. code-block:: python +.. code-block:: bash python examples/nlp/language_modeling/megatron_change_num_partitions.py \ --model_file=/path/to/source.nemo \ diff --git a/docs/source/multimodal/mllm/datasets.rst b/docs/source/multimodal/mllm/datasets.rst index 1c64c4d317d2..2f2000124e4d 100644 --- a/docs/source/multimodal/mllm/datasets.rst +++ b/docs/source/multimodal/mllm/datasets.rst @@ -90,6 +90,14 @@ For NeVA training, integrating special tokens into the tokenizer is vital. After .. code-block:: bash + cd /opt; git clone https://github.com/google/sentencepiece.git && \ + cd sentencepiece && \ + mkdir build && \ + cd build && \ + cmake .. 
&& \ + make && \ + make install && \ + ldconfig cd /opt/sentencepiece/src/; protoc --python_out=/opt/NeMo/scripts/tokenizers/ sentencepiece_model.proto python /opt/NeMo/scripts/tokenizers/add_special_tokens_to_sentencepiece.py \ --input_file /path/to/neva/tokenizers/tokenizer.model \ diff --git a/docs/source/multimodal/mllm/intro.rst b/docs/source/multimodal/mllm/intro.rst index 687ecd930a9e..0e76a9737a0f 100644 --- a/docs/source/multimodal/mllm/intro.rst +++ b/docs/source/multimodal/mllm/intro.rst @@ -10,4 +10,5 @@ The endeavor to extend Language Models (LLMs) into multimodal domains by integra configs checkpoint neva - + video_neva + sequence_packing diff --git a/docs/source/multimodal/mllm/sequence_packing.rst b/docs/source/multimodal/mllm/sequence_packing.rst new file mode 100644 index 000000000000..b061ee1d89c6 --- /dev/null +++ b/docs/source/multimodal/mllm/sequence_packing.rst @@ -0,0 +1,127 @@ +Sequence Packing for NeVA +========================= + +Overview +-------- +As outlined in the throughput optimizations section, most multimodal LLM datasets, such as the LLaVA datasets, exhibit a skewed distribution of sequence lengths. Many sequences are short, and a few are very long, conforming to Zipf’s Law. Transformer models require fixed-length inputs, necessitating padding with many unused pad tokens, which is inefficient for two reasons: + +1. Computation on pad values is disregarded in the final model output, resulting in wasted FLOPs. +2. The micro batch size is often constrained by the batch containing the longest sequences, leading to underutilized GPU memory in most other batches. + +Sequence packing is a training technique wherein multiple training sequences (examples) are concatenated into one long sequence (pack). This approach eliminates the need for padding and allows for more tokens to be processed per micro batch, optimizing both GPU compute and memory utilization. + +For Sequence Packing in SFT / PEFT for LLMs, NeVA considers the following design: + +1. Original Datasets to Sequence Lengths Files + + 1.1. **PyTorch Loaders for Dataset Processing Efficiency** + To efficiently manage large datasets (~700K sequences), the system utilizes PyTorch's DataLoader with multi-worker capabilities, significantly speeding up the data processing phase by parallelizing the loading and pre-processing steps. + 1.2. **Handling Large Datasets** + The system writes sequence lengths to disk on the fly, ensuring scalability and efficient memory usage, as loading all data into memory is impractical. + 1.3. **Efficient I/O Operations** + To facilitate efficient I/O operations necessary for parallelized data loading, the system employs IndexedDataset from Megatron-Core, chosen for its ability to dynamically build binary tensor files. + +2. Packing Sequences into Bins + + 2.1. **Algorithm Choices and Performance** + The first_fit_decreasing and first_fit_shuffle algorithms initially used for packing sequences into bins showed performance issues due to their O(n^2) complexity, making the processing of NeVA samples time-consuming. + 2.2. **Introduction of shuffle_and_pack** + To address these inefficiencies, the shuffle_and_pack algorithm was introduced, an O(n) complexity algorithm that shuffles the sequence lengths before packing them into bins sequentially, significantly improving processing time. + 2.3. 
**Parallelization of Packing Process** + The system implements a parallelized approach to the first_fit_shuffle algorithm by dividing the samples into chunks (~20K samples each) and processing them separately, effectively mitigating the quadratic complexity problem. The bins from each chunk are then combined in the final step, enhancing overall efficiency. + 2.4. **Efficiency Improvements with completed_bins** + A minor optimization involves using completed_bins to prevent the algorithm from iterating over bins that cannot accommodate the minimum sequence length, leading to a more efficient packing process. + +3. Reading Sequence Lengths and Packing into New Files + After determining the optimal bins for packing, the system reads the sequence lengths from the generated files and packs these lengths into new files based on the bins' assignments. This final step consolidates the sequences into efficiently packed bins, ready for further processing or analysis. + +Performance Improvement +----------------------- +A 40% speed increase was achieved with optimized sequence packing for sequence length w/ Vicuna-1.5 13B (LLaVA 1.5 recipe). Detailed performance metrics across different configurations and stages are provided in the tables below. + +Fine-tuning Performance Table: + ++--------------+---------------------------+----------------+----+----+-----------+------------------+-----------------+-------------------+---------------+-------------------+ +| Stage | Vision Encoder | LLM Model | TP | PP | Precision | Sequence Packing | Step Timing (s) | Global Batch Size | Samples / Sec | Perf Improvement | ++==============+===========================+================+====+====+===========+==================+=================+===================+===============+===================+ +| Fine-tuning | openai/clip-vit-large- | Vicuna-1.5 13B | 8 | 1 | BF16 | No | 2.008 | 128 | 63.745 | 0% | +| | patch14-336 | | | | | | | | | | ++--------------+---------------------------+----------------+----+----+-----------+------------------+-----------------+-------------------+---------------+-------------------+ +| Fine-tuning | openai/clip-vit-large- | Vicuna-1.5 13B | 4 | 2 | BF16 | No | 1.889 | 128 | 67.761 | 6% | +| | patch14-336 | | | | | | | | | | ++--------------+---------------------------+----------------+----+----+-----------+------------------+-----------------+-------------------+---------------+-------------------+ +| Fine-tuning | openai/clip-vit-large- | Vicuna-1.5 13B | 8 | 1 | BF16 | Yes | 1.302 | 116.08 | 89.155 | 40% | +| | patch14-336 | | | | | | | | | | ++--------------+---------------------------+----------------+----+----+-----------+------------------+-----------------+-------------------+---------------+-------------------+ +| Fine-tuning | openai/clip-vit-large- | Vicuna-1.5 13B | 4 | 2 | BF16 | Yes | 1.237 | 116.08 | 93.840 | 47% | +| | patch14-336 | | | | | | | | | | ++--------------+---------------------------+----------------+----+----+-----------+------------------+-----------------+-------------------+---------------+-------------------+ + +How to Run NeVA with Packed Sequence +------------------------------------ +Prepare Dataset +^^^^^^^^^^^^^^^ +We provide an easy-to-use script for preprocessing a dataset for the NeMo Multimodal Learning framework. It requires specifying paths for data, images, and the tokenizer model, among other parameters. + +.. 
code-block:: bash + + python examples/multimodal/multimodal_llm/neva/sequence_packing/preprocess_dataset.py \ + --data_path=/path/to/LLaVA-Instruct-150K/llava_v1_5_mix665k_filtered.json \ + --image_folder=/path/to/LLaVA-Instruct-150K/images \ + --tokenizer_path=/path/to/checkpoints/tokenizer_add_special.model \ + --output_dir=/path/to/LLaVA-Instruct-150K/packed_seq_12288_336_v1 \ + --max_seq_length=12288 \ + --packing_algorithm=first_fit_shuffle \ + --hf_vision_encoder=openai/clip-vit-large-patch14-336 \ + --conv_template=v1 \ + --image_aspect_ratio=pad \ + --seed=42 + +Parameters: +* ``--data_path``: Path to the dataset file in JSON format. +* ``--image_folder``: Directory containing the images referenced in the dataset. +* ``--tokenizer_path``: Path to the tokenizer model. +* ``--output_dir``: Directory where the processed dataset will be stored. +* ``--max_seq_length``: The maximum sequence length of the model. +* ``--packing_algorithm``: Algorithm used for packing sequences. Defaults to 'first_fit_shuffle'. +* ``--hf_vision_encoder``: The Hugging Face vision encoder to use. Default is 'openai/clip-vit-large-patch14-336'. +* ``--conv_template``: Template for data conversion. Default is 'plain', with 'v1' as an alternative. +* ``--image_aspect_ratio``: The aspect ratio for processing images. Defaults to 'square', 'pad' for padding to maintain aspect ratio. +* ``--seed``: Seed for random operations in 'first_fit_shuffle'. +* ``--hparams_file``: Optional path to a YAML file containing additional hyperparameters. + +Remarks: +1. The current version of data processing saves processed image tensors in the sequence packing, which may require significant storage. This issue will be addressed in future iterations. +2. The ``max_seq_length`` is crucial for achieving optimal performance. Excessive length can lead to out-of-memory errors, while insufficient length may degrade performance. +3. The conversation prompt template is inserted during this step to ensure accurate sequence length calculation. + +Adjust Training Config +"""""""""""""""""""""" +To train with packed sequences, modify four items in the SFT/PEFT config file. + +1. Enable the ``packed_sequence`` flag: + +.. code-block:: bash + + ++model.data.data_prefix=/lustre/fsw/coreai_dlalgo_genai/datasets/LLaVA-Instruct-150K/packed_seq_12288_336_v1/packed_seq_dataset + ++model.data.crop_size=[224,224] + ++model.data.packed_sequence=True + +2. Use the new dataset file instead of the original JSONL file and ensure the crop sizes are correctly specified since images are now cached: + +.. code-block:: bash + + ++model.data.data_prefix=/path/to/datasets/LLaVA-Instruct-150K/packed_seq_12288_336_v1/packed_seq_dataset + ++model.data.crop_size=[336,336] + +4. Adjust batch sizes: + +* Micro batch size should be set to 1 due to concatenation in the preprocessing step. Increase ``pack_size`` to achieve a higher micro batch size. +* Global batch size should be adjusted based on the average number of sequences per pack (``n``), calculated as the total number of sequences divided by the number of packs. This maintains the training recipe by ensuring each gradient iteration sees, on average, the same number of tokens. + +.. code-block:: bash + + model.micro_batch_size=1 + model.global_batch_size= + +Now, you are ready to fine-tune your model with significantly improved throughput! 
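As a worked example of the batch size adjustment above (with made-up numbers), the sketch below shows how the packed global batch size follows from the average number of sequences per pack:

.. code-block:: python

    # Hypothetical numbers for illustration: keep the number of sequences seen
    # per optimizer step roughly constant after switching to packed sequences.
    num_sequences = 665_000        # sequences in the unpacked dataset
    num_packs = 95_000             # packs written by the preprocessing script
    avg_seqs_per_pack = num_sequences / num_packs     # "n" in the text, here 7.0

    unpacked_global_batch_size = 128
    micro_batch_size = 1           # one pack per micro batch after packing
    packed_global_batch_size = max(
        micro_batch_size,
        round(unpacked_global_batch_size / avg_seqs_per_pack),
    )
    print(packed_global_batch_size)  # 18 packs per step, roughly 126 sequences per step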
diff --git a/docs/source/multimodal/mllm/video_neva.rst b/docs/source/multimodal/mllm/video_neva.rst new file mode 100644 index 000000000000..eb0624545a3e --- /dev/null +++ b/docs/source/multimodal/mllm/video_neva.rst @@ -0,0 +1,204 @@ +Video NeVA +========== + +Model Introduction +------------------ + +Video NeVa adds support for video modality in NeVa by representing video as multiple image frames. + +There is only a minor change done to :class:`~nemo.collections.multimodal.models.multimodal_llm.neva.neva_model.MegatronNevaModel` class in order to support pretraining on video input data. + +Representing video input as a series of images is done in :class:`~nemo.collections.multimodal.data.neva.TarOrFolderVideoLoader` class, using Decord which provides convenient video slicing methods. + + +Video Neva Configuration +^^^^^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: yaml + + data: + media_type: video + splice_single_frame: null + num_frames: 8 + image_token_len: 256 + image_folder: null + video_folder: null + +- ``media_type``: If set to `video`, NeVa's dataloader goes through the additional preprocessing steps to represent the input video data as a series of image frames. +- ``splice_single_frame``: Can either be set as `first`, `middle` or `last`. This will result in only a single frame in that specific location of the video being selected. +- ``image_token_len``: The NeVa dataloader calculates `image_token_len` based on the height and width of the preprocessed image frame and the patch size of the CLIP model being used. + +.. code-block:: python + + image_token_len = (224 // 14) * (224 // 14) = 16 * 16 = 256 + +- ``num_frames``: This is used to select the number of image frames that will be used to represent the video. +- ``video_folder``: This specifies the directory where the video files are located. This follows the same format as NeVa's `image_folder`. + + + +Inference with Video NeVA +========================= + +We can run ``neva_evaluation.py`` located in ``NeMo/examples/multimodal/multimodal_llm/neva`` to generate inference results from the Video NeVA model. +Currently, video NeVA supports both image and video inference by changing the config attribute ``inference.media_type`` in ``NeMo/examples/multimodal/multimodal_llm/neva/conf/neva_inference.yaml`` to either ``image`` or ``video``, and adding the corresponding media path ``inference.media_base_path``. 
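Because a video is expanded into image frames, it can help to estimate the media token budget before running inference, for example when choosing ``num_frames`` or ``inference.tokens_to_generate``. The sketch below assumes a CLIP ViT-L/14 encoder at 224x224 resolution and that every frame contributes ``image_token_len`` tokens, matching the configuration example earlier on this page.

.. code-block:: python

    # Back-of-the-envelope sketch (assumptions: CLIP ViT-L/14 at 224x224, and
    # each frame contributes image_token_len tokens to the prompt).
    crop_size, patch_size = 224, 14
    image_token_len = (crop_size // patch_size) ** 2   # 16 * 16 = 256 tokens per frame
    num_frames = 8                                     # from the data config above
    video_tokens = num_frames * image_token_len        # 2048 media tokens per video
    print(image_token_len, video_tokens)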
+ +Inference with Pretrained Projectors with Base LM Model +------------------------------------------------------- + +An example of an inference script execution: + +For running video inference:: + + CUDA_DEVICE_MAX_CONNECTIONS=1 CUDA_VISIBLE_DEVICES=0,1,2,3 python3 /path/to/neva_evaluation.py \ + --config-path=/path/to/conf/ \ + --config-name=neva_inference.yaml \ + tensor_model_parallel_size=4 \ + pipeline_model_parallel_size=1 \ + neva_model_file=/path/to/projector/checkpoint \ + base_model_file=/path/to/base/lm/checkpoint \ + trainer.devices=4 \ + trainer.precision=bf16 \ + prompt_file=/path/to/prompt/file \ + inference.media_base_path=/path/to/videos \ + inference.media_type=video \ + output_file=/path/for/output/file/ \ + inference.temperature=0.2 \ + inference.top_k=0 \ + inference.top_p=0.9 \ + inference.greedy=False \ + inference.add_BOS=False \ + inference.all_probs=False \ + inference.repetition_penalty=1.2 \ + inference.insert_media_token=right \ + inference.tokens_to_generate=256 \ + quantization.algorithm=awq \ + quantization.enable=False + +Example format of ``.jsonl`` prompt_file:: + + {"video": "video_test.mp4", "text": "Can you describe the scene?", "category": "conv", "question_id": 0} + +input video file: video_test.mp4 + +Output:: + + System + A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. + + User + Can you describe the scene?